1from test.test_support import verbose, run_unittest, import_module 2import re 3from re import Scanner 4import sys 5import string 6import traceback 7from weakref import proxy 8 9# Misc tests from Tim Peters' re.doc 10 11# WARNING: Don't change details in these tests if you don't know 12# what you're doing. Some of these tests were carefully modeled to 13# cover most of the code. 14 15import unittest 16 17class ReTests(unittest.TestCase): 18 19 def test_weakref(self): 20 s = 'QabbbcR' 21 x = re.compile('ab+c') 22 y = proxy(x) 23 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 24 25 def test_search_star_plus(self): 26 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 27 self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 28 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 29 self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 30 self.assertEqual(re.search('x', 'aaa'), None) 31 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 32 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 33 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 34 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 35 self.assertEqual(re.match('a+', 'xxx'), None) 36 37 def bump_num(self, matchobj): 38 int_value = int(matchobj.group(0)) 39 return str(int_value + 1) 40 41 def test_basic_re_sub(self): 42 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 43 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 44 '9.3 -3 24x100y') 45 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 46 '9.3 -3 23x99y') 47 48 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 49 self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 50 51 s = r"\1\1" 52 self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 53 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s) 54 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 55 56 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx') 57 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx') 58 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') 59 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') 60 61 self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), 62 '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') 63 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') 64 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), 65 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) 66 67 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') 68 69 def test_bug_449964(self): 70 # fails for group followed by other escape 71 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'), 72 'xx\bxx\b') 73 74 def test_bug_449000(self): 75 # Test for sub() on escaped characters 76 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 77 'abc\ndef\n') 78 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 79 'abc\ndef\n') 80 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 81 'abc\ndef\n') 82 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 83 'abc\ndef\n') 84 85 def test_bug_1140(self): 86 # re.sub(x, y, u'') should return u'', not '', and 87 # re.sub(x, y, '') should return '', not u''. 88 # Also: 89 # re.sub(x, y, unicode(x)) should return unicode(y), and 90 # re.sub(x, y, str(x)) should return 91 # str(y) if isinstance(y, str) else unicode(y). 92 for x in 'x', u'x': 93 for y in 'y', u'y': 94 z = re.sub(x, y, u'') 95 self.assertEqual(z, u'') 96 self.assertEqual(type(z), unicode) 97 # 98 z = re.sub(x, y, '') 99 self.assertEqual(z, '') 100 self.assertEqual(type(z), str) 101 # 102 z = re.sub(x, y, unicode(x)) 103 self.assertEqual(z, y) 104 self.assertEqual(type(z), unicode) 105 # 106 z = re.sub(x, y, str(x)) 107 self.assertEqual(z, y) 108 self.assertEqual(type(z), type(y)) 109 110 def test_bug_1661(self): 111 # Verify that flags do not get silently ignored with compiled patterns 112 pattern = re.compile('.') 113 self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 114 self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 115 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 116 self.assertRaises(ValueError, re.compile, pattern, re.I) 117 118 def test_bug_3629(self): 119 # A regex that triggered a bug in the sre-code validator 120 re.compile("(?P<quote>)(?(quote))") 121 122 def test_sub_template_numeric_escape(self): 123 # bug 776311 and friends 124 self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 125 self.assertEqual(re.sub('x', r'\000', 'x'), '\000') 126 self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 127 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 128 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 129 self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 130 self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 131 132 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 133 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 134 135 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 136 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 137 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 138 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 139 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 140 141 self.assertEqual(re.sub('x', r'\400', 'x'), '\0') 142 self.assertEqual(re.sub('x', r'\777', 'x'), '\377') 143 144 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') 145 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') 146 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') 147 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') 148 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') 149 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') 150 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') 151 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') 152 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' 153 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') 154 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' 155 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' 156 157 # in python2.3 (etc), these loop endlessly in sre_parser.py 158 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 159 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 160 'xz8') 161 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 162 'xza') 163 164 def test_qualified_re_sub(self): 165 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 166 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 167 168 def test_bug_114660(self): 169 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 170 'hello there') 171 172 def test_bug_462270(self): 173 # Test for empty sub() behaviour, see SF bug #462270 174 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') 175 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') 176 177 def test_symbolic_refs(self): 178 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') 179 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx') 180 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx') 181 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx') 182 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx') 183 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx') 184 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx') 185 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx') 186 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx') 187 188 def test_re_subn(self): 189 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 190 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 191 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 192 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 193 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 194 195 def test_re_split(self): 196 self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) 197 self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) 198 self.assertEqual(re.split("(:*)", ":a:b::c"), 199 ['', ':', 'a', ':', 'b', '::', 'c']) 200 self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) 201 self.assertEqual(re.split("(:)*", ":a:b::c"), 202 ['', ':', 'a', ':', 'b', ':', 'c']) 203 self.assertEqual(re.split("([b:]+)", ":a:b::c"), 204 ['', ':', 'a', ':b::', 'c']) 205 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 206 ['', None, ':', 'a', None, ':', '', 'b', None, '', 207 None, '::', 'c']) 208 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 209 ['', 'a', '', '', 'c']) 210 211 def test_qualified_re_split(self): 212 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 213 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) 214 self.assertEqual(re.split("(:)", ":a:b::c", 2), 215 ['', ':', 'a', ':', 'b::c']) 216 self.assertEqual(re.split("(:*)", ":a:b::c", 2), 217 ['', ':', 'a', ':', 'b::c']) 218 219 def test_re_findall(self): 220 self.assertEqual(re.findall(":+", "abc"), []) 221 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"]) 222 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"]) 223 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""), 224 (":", ":"), 225 (":", "::")]) 226 227 def test_bug_117612(self): 228 self.assertEqual(re.findall(r"(a|(b))", "aba"), 229 [("a", ""),("b", "b"),("a", "")]) 230 231 def test_re_match(self): 232 self.assertEqual(re.match('a', 'a').groups(), ()) 233 self.assertEqual(re.match('(a)', 'a').groups(), ('a',)) 234 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a') 235 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a') 236 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a')) 237 238 pat = re.compile('((a)|(b))(c)?') 239 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 240 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 241 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 242 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 243 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 244 245 # A single group 246 m = re.match('(a)', 'a') 247 self.assertEqual(m.group(0), 'a') 248 self.assertEqual(m.group(0), 'a') 249 self.assertEqual(m.group(1), 'a') 250 self.assertEqual(m.group(1, 1), ('a', 'a')) 251 252 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 253 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 254 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 255 (None, 'b', None)) 256 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 257 258 def test_re_groupref_exists(self): 259 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 260 ('(', 'a')) 261 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), 262 (None, 'a')) 263 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None) 264 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None) 265 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 266 ('a', 'b')) 267 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 268 (None, 'd')) 269 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 270 (None, 'd')) 271 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(), 272 ('a', '')) 273 274 # Tests for bug #1177831: exercise groups other than the first group 275 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 276 self.assertEqual(p.match('abc').groups(), 277 ('a', 'b', 'c')) 278 self.assertEqual(p.match('ad').groups(), 279 ('a', None, 'd')) 280 self.assertEqual(p.match('abd'), None) 281 self.assertEqual(p.match('ac'), None) 282 283 284 def test_re_groupref(self): 285 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 286 ('|', 'a')) 287 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 288 (None, 'a')) 289 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None) 290 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None) 291 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 292 ('a', 'a')) 293 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 294 (None, None)) 295 296 def test_groupdict(self): 297 self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 298 'first second').groupdict(), 299 {'first':'first', 'second':'second'}) 300 301 def test_expand(self): 302 self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 303 "first second") 304 .expand(r"\2 \1 \g<second> \g<first>"), 305 "second first second first") 306 307 def test_repeat_minmax(self): 308 self.assertEqual(re.match("^(\w){1}$", "abc"), None) 309 self.assertEqual(re.match("^(\w){1}?$", "abc"), None) 310 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None) 311 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None) 312 313 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c") 314 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c") 315 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c") 316 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 317 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c") 318 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c") 319 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c") 320 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 321 322 self.assertEqual(re.match("^x{1}$", "xxx"), None) 323 self.assertEqual(re.match("^x{1}?$", "xxx"), None) 324 self.assertEqual(re.match("^x{1,2}$", "xxx"), None) 325 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None) 326 327 self.assertNotEqual(re.match("^x{3}$", "xxx"), None) 328 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None) 329 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None) 330 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) 331 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None) 332 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None) 333 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None) 334 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) 335 336 self.assertEqual(re.match("^x{}$", "xxx"), None) 337 self.assertNotEqual(re.match("^x{}$", "x{}"), None) 338 339 def test_getattr(self): 340 self.assertEqual(re.match("(a)", "a").pos, 0) 341 self.assertEqual(re.match("(a)", "a").endpos, 1) 342 self.assertEqual(re.match("(a)", "a").string, "a") 343 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 344 self.assertNotEqual(re.match("(a)", "a").re, None) 345 346 def test_special_escapes(self): 347 self.assertEqual(re.search(r"\b(b.)\b", 348 "abcd abc bcd bx").group(1), "bx") 349 self.assertEqual(re.search(r"\B(b.)\B", 350 "abc bcd bc abxd").group(1), "bx") 351 self.assertEqual(re.search(r"\b(b.)\b", 352 "abcd abc bcd bx", re.LOCALE).group(1), "bx") 353 self.assertEqual(re.search(r"\B(b.)\B", 354 "abc bcd bc abxd", re.LOCALE).group(1), "bx") 355 self.assertEqual(re.search(r"\b(b.)\b", 356 "abcd abc bcd bx", re.UNICODE).group(1), "bx") 357 self.assertEqual(re.search(r"\B(b.)\B", 358 "abc bcd bc abxd", re.UNICODE).group(1), "bx") 359 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 360 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 361 self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None) 362 self.assertEqual(re.search(r"\b(b.)\b", 363 u"abcd abc bcd bx").group(1), "bx") 364 self.assertEqual(re.search(r"\B(b.)\B", 365 u"abc bcd bc abxd").group(1), "bx") 366 self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc") 367 self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc") 368 self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None) 369 self.assertEqual(re.search(r"\d\D\w\W\s\S", 370 "1aa! a").group(0), "1aa! a") 371 self.assertEqual(re.search(r"\d\D\w\W\s\S", 372 "1aa! a", re.LOCALE).group(0), "1aa! a") 373 self.assertEqual(re.search(r"\d\D\w\W\s\S", 374 "1aa! a", re.UNICODE).group(0), "1aa! a") 375 376 def test_bigcharset(self): 377 self.assertEqual(re.match(u"([\u2222\u2223])", 378 u"\u2222").group(1), u"\u2222") 379 self.assertEqual(re.match(u"([\u2222\u2223])", 380 u"\u2222", re.UNICODE).group(1), u"\u2222") 381 382 def test_anyall(self): 383 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 384 "a\nb") 385 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 386 "a\n\nb") 387 388 def test_non_consuming(self): 389 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") 390 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") 391 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") 392 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a") 393 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 394 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 395 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 396 397 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 398 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 399 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 400 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 401 402 def test_ignore_case(self): 403 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 404 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 405 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 406 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 407 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 408 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 409 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 410 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 411 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") 412 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 413 414 def test_category(self): 415 self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 416 417 def test_getlower(self): 418 import _sre 419 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) 420 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) 421 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) 422 423 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 424 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 425 426 def test_not_literal(self): 427 self.assertEqual(re.search("\s([^a])", " b").group(1), "b") 428 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb") 429 430 def test_search_coverage(self): 431 self.assertEqual(re.search("\s(b)", " b").group(1), "b") 432 self.assertEqual(re.search("a\s", "a ").group(0), "a ") 433 434 def assertMatch(self, pattern, text, match=None, span=None, 435 matcher=re.match): 436 if match is None and span is None: 437 # the pattern matches the whole text 438 match = text 439 span = (0, len(text)) 440 elif match is None or span is None: 441 raise ValueError('If match is not None, span should be specified ' 442 '(and vice versa).') 443 m = matcher(pattern, text) 444 self.assertTrue(m) 445 self.assertEqual(m.group(), match) 446 self.assertEqual(m.span(), span) 447 448 def test_re_escape(self): 449 alnum_chars = string.ascii_letters + string.digits 450 p = u''.join(unichr(i) for i in range(256)) 451 for c in p: 452 if c in alnum_chars: 453 self.assertEqual(re.escape(c), c) 454 elif c == u'\x00': 455 self.assertEqual(re.escape(c), u'\\000') 456 else: 457 self.assertEqual(re.escape(c), u'\\' + c) 458 self.assertMatch(re.escape(c), c) 459 self.assertMatch(re.escape(p), p) 460 461 def test_re_escape_byte(self): 462 alnum_chars = (string.ascii_letters + string.digits).encode('ascii') 463 p = ''.join(chr(i) for i in range(256)) 464 for b in p: 465 if b in alnum_chars: 466 self.assertEqual(re.escape(b), b) 467 elif b == b'\x00': 468 self.assertEqual(re.escape(b), b'\\000') 469 else: 470 self.assertEqual(re.escape(b), b'\\' + b) 471 self.assertMatch(re.escape(b), b) 472 self.assertMatch(re.escape(p), p) 473 474 def test_re_escape_non_ascii(self): 475 s = u'xxx\u2620\u2620\u2620xxx' 476 s_escaped = re.escape(s) 477 self.assertEqual(s_escaped, u'xxx\\\u2620\\\u2620\\\u2620xxx') 478 self.assertMatch(s_escaped, s) 479 self.assertMatch(u'.%s+.' % re.escape(u'\u2620'), s, 480 u'x\u2620\u2620\u2620x', (2, 7), re.search) 481 482 def test_re_escape_non_ascii_bytes(self): 483 b = u'y\u2620y\u2620y'.encode('utf-8') 484 b_escaped = re.escape(b) 485 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') 486 self.assertMatch(b_escaped, b) 487 res = re.findall(re.escape(u'\u2620'.encode('utf-8')), b) 488 self.assertEqual(len(res), 2) 489 490 def test_pickling(self): 491 import pickle 492 self.pickle_test(pickle) 493 import cPickle 494 self.pickle_test(cPickle) 495 # old pickles expect the _compile() reconstructor in sre module 496 import_module("sre", deprecated=True) 497 from sre import _compile 498 499 def pickle_test(self, pickle): 500 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') 501 s = pickle.dumps(oldpat) 502 newpat = pickle.loads(s) 503 self.assertEqual(oldpat, newpat) 504 505 def test_constants(self): 506 self.assertEqual(re.I, re.IGNORECASE) 507 self.assertEqual(re.L, re.LOCALE) 508 self.assertEqual(re.M, re.MULTILINE) 509 self.assertEqual(re.S, re.DOTALL) 510 self.assertEqual(re.X, re.VERBOSE) 511 512 def test_flags(self): 513 for flag in [re.I, re.M, re.X, re.S, re.L]: 514 self.assertNotEqual(re.compile('^pattern$', flag), None) 515 516 def test_sre_character_literals(self): 517 for i in [0, 8, 16, 32, 64, 127, 128, 255]: 518 self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None) 519 self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None) 520 self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None) 521 self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None) 522 self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None) 523 self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None) 524 self.assertRaises(re.error, re.match, "\911", "") 525 526 def test_sre_character_class_literals(self): 527 for i in [0, 8, 16, 32, 64, 127, 128, 255]: 528 self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None) 529 self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None) 530 self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None) 531 self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None) 532 self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None) 533 self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None) 534 self.assertRaises(re.error, re.match, "[\911]", "") 535 536 def test_bug_113254(self): 537 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 538 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 539 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 540 541 def test_bug_527371(self): 542 # bug described in patches 527371/672491 543 self.assertEqual(re.match(r'(a)?a','a').lastindex, None) 544 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 545 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 546 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a') 547 self.assertEqual(re.match("((a))", "a").lastindex, 1) 548 549 def test_bug_545855(self): 550 # bug 545855 -- This pattern failed to cause a compile error as it 551 # should, instead provoking a TypeError. 552 self.assertRaises(re.error, re.compile, 'foo[a-') 553 554 def test_bug_418626(self): 555 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 556 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 557 # pattern '*?' on a long string. 558 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 559 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 560 20003) 561 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 562 # non-simple '*?' still used to hit the recursion limit, before the 563 # non-recursive scheme was implemented. 564 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 565 566 def test_bug_612074(self): 567 pat=u"["+re.escape(u"\u2039")+u"]" 568 self.assertEqual(re.compile(pat) and 1, 1) 569 570 def test_stack_overflow(self): 571 # nasty cases that used to overflow the straightforward recursive 572 # implementation of repeated groups. 573 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 574 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 575 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 576 577 def test_scanner(self): 578 def s_ident(scanner, token): return token 579 def s_operator(scanner, token): return "op%s" % token 580 def s_float(scanner, token): return float(token) 581 def s_int(scanner, token): return int(token) 582 583 scanner = Scanner([ 584 (r"[a-zA-Z_]\w*", s_ident), 585 (r"\d+\.\d*", s_float), 586 (r"\d+", s_int), 587 (r"=|\+|-|\*|/", s_operator), 588 (r"\s+", None), 589 ]) 590 591 self.assertNotEqual(scanner.scanner.scanner("").pattern, None) 592 593 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 594 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 595 'op+', 'bar'], '')) 596 597 def test_bug_448951(self): 598 # bug 448951 (similar to 429357, but with single char match) 599 # (Also test greedy matches.) 600 for op in '','?','*': 601 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 602 (None, None)) 603 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 604 ('a:', 'a')) 605 606 def test_bug_725106(self): 607 # capturing groups in alternatives in repeats 608 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 609 ('b', 'a')) 610 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 611 ('c', 'b')) 612 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 613 ('b', None)) 614 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 615 ('b', None)) 616 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 617 ('b', 'a')) 618 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 619 ('c', 'b')) 620 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 621 ('b', None)) 622 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 623 ('b', None)) 624 625 def test_bug_725149(self): 626 # mark_stack_base restoring before restoring marks 627 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 628 ('a', None)) 629 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 630 ('a', None, None)) 631 632 def test_bug_764548(self): 633 # bug 764548, re.compile() barfs on str/unicode subclasses 634 try: 635 unicode 636 except NameError: 637 return # no problem if we have no unicode 638 class my_unicode(unicode): pass 639 pat = re.compile(my_unicode("abc")) 640 self.assertEqual(pat.match("xyz"), None) 641 642 def test_finditer(self): 643 iter = re.finditer(r":+", "a:b::c:::d") 644 self.assertEqual([item.group(0) for item in iter], 645 [":", "::", ":::"]) 646 647 def test_bug_926075(self): 648 try: 649 unicode 650 except NameError: 651 return # no problem if we have no unicode 652 self.assertTrue(re.compile('bug_926075') is not 653 re.compile(eval("u'bug_926075'"))) 654 655 def test_bug_931848(self): 656 try: 657 unicode 658 except NameError: 659 pass 660 pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"') 661 self.assertEqual(re.compile(pattern).split("a.b.c"), 662 ['a','b','c']) 663 664 def test_bug_581080(self): 665 iter = re.finditer(r"\s", "a b") 666 self.assertEqual(iter.next().span(), (1,2)) 667 self.assertRaises(StopIteration, iter.next) 668 669 scanner = re.compile(r"\s").scanner("a b") 670 self.assertEqual(scanner.search().span(), (1, 2)) 671 self.assertEqual(scanner.search(), None) 672 673 def test_bug_817234(self): 674 iter = re.finditer(r".*", "asdf") 675 self.assertEqual(iter.next().span(), (0, 4)) 676 self.assertEqual(iter.next().span(), (4, 4)) 677 self.assertRaises(StopIteration, iter.next) 678 679 def test_bug_6561(self): 680 # '\d' should match characters in Unicode category 'Nd' 681 # (Number, Decimal Digit), but not those in 'Nl' (Number, 682 # Letter) or 'No' (Number, Other). 683 decimal_digits = [ 684 u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd' 685 u'\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' 686 u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' 687 ] 688 for x in decimal_digits: 689 self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x) 690 691 not_decimal_digits = [ 692 u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' 693 u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' 694 u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No' 695 u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' 696 ] 697 for x in not_decimal_digits: 698 self.assertIsNone(re.match('^\d$', x, re.UNICODE)) 699 700 def test_empty_array(self): 701 # SF buf 1647541 702 import array 703 for typecode in 'cbBuhHiIlLfd': 704 a = array.array(typecode) 705 self.assertEqual(re.compile("bla").match(a), None) 706 self.assertEqual(re.compile("").match(a).groups(), ()) 707 708 def test_inline_flags(self): 709 # Bug #1700 710 upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow 711 lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow 712 713 p = re.compile(upper_char, re.I | re.U) 714 q = p.match(lower_char) 715 self.assertNotEqual(q, None) 716 717 p = re.compile(lower_char, re.I | re.U) 718 q = p.match(upper_char) 719 self.assertNotEqual(q, None) 720 721 p = re.compile('(?i)' + upper_char, re.U) 722 q = p.match(lower_char) 723 self.assertNotEqual(q, None) 724 725 p = re.compile('(?i)' + lower_char, re.U) 726 q = p.match(upper_char) 727 self.assertNotEqual(q, None) 728 729 p = re.compile('(?iu)' + upper_char) 730 q = p.match(lower_char) 731 self.assertNotEqual(q, None) 732 733 p = re.compile('(?iu)' + lower_char) 734 q = p.match(upper_char) 735 self.assertNotEqual(q, None) 736 737 def test_dollar_matches_twice(self): 738 "$ matches the end of string, and just before the terminating \n" 739 pattern = re.compile('$') 740 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 741 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 742 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 743 744 pattern = re.compile('$', re.MULTILINE) 745 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 746 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 747 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 748 749 def test_dealloc(self): 750 # issue 3299: check for segfault in debug build 751 import _sre 752 # the overflow limit is different on wide and narrow builds and it 753 # depends on the definition of SRE_CODE (see sre.h). 754 # 2**128 should be big enough to overflow on both. For smaller values 755 # a RuntimeError is raised instead of OverflowError. 756 long_overflow = 2**128 757 self.assertRaises(TypeError, re.finditer, "a", {}) 758 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) 759 760def run_re_tests(): 761 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR 762 if verbose: 763 print 'Running re_tests test suite' 764 else: 765 # To save time, only run the first and last 10 tests 766 #tests = tests[:10] + tests[-10:] 767 pass 768 769 for t in tests: 770 sys.stdout.flush() 771 pattern = s = outcome = repl = expected = None 772 if len(t) == 5: 773 pattern, s, outcome, repl, expected = t 774 elif len(t) == 3: 775 pattern, s, outcome = t 776 else: 777 raise ValueError, ('Test tuples should have 3 or 5 fields', t) 778 779 try: 780 obj = re.compile(pattern) 781 except re.error: 782 if outcome == SYNTAX_ERROR: pass # Expected a syntax error 783 else: 784 print '=== Syntax error:', t 785 except KeyboardInterrupt: raise KeyboardInterrupt 786 except: 787 print '*** Unexpected error ***', t 788 if verbose: 789 traceback.print_exc(file=sys.stdout) 790 else: 791 try: 792 result = obj.search(s) 793 except re.error, msg: 794 print '=== Unexpected exception', t, repr(msg) 795 if outcome == SYNTAX_ERROR: 796 # This should have been a syntax error; forget it. 797 pass 798 elif outcome == FAIL: 799 if result is None: pass # No match, as expected 800 else: print '=== Succeeded incorrectly', t 801 elif outcome == SUCCEED: 802 if result is not None: 803 # Matched, as expected, so now we compute the 804 # result string and compare it to our expected result. 805 start, end = result.span(0) 806 vardict={'found': result.group(0), 807 'groups': result.group(), 808 'flags': result.re.flags} 809 for i in range(1, 100): 810 try: 811 gi = result.group(i) 812 # Special hack because else the string concat fails: 813 if gi is None: 814 gi = "None" 815 except IndexError: 816 gi = "Error" 817 vardict['g%d' % i] = gi 818 for i in result.re.groupindex.keys(): 819 try: 820 gi = result.group(i) 821 if gi is None: 822 gi = "None" 823 except IndexError: 824 gi = "Error" 825 vardict[i] = gi 826 repl = eval(repl, vardict) 827 if repl != expected: 828 print '=== grouping error', t, 829 print repr(repl) + ' should be ' + repr(expected) 830 else: 831 print '=== Failed incorrectly', t 832 833 # Try the match on a unicode string, and check that it 834 # still succeeds. 835 try: 836 result = obj.search(unicode(s, "latin-1")) 837 if result is None: 838 print '=== Fails on unicode match', t 839 except NameError: 840 continue # 1.5.2 841 except TypeError: 842 continue # unicode test case 843 844 # Try the match on a unicode pattern, and check that it 845 # still succeeds. 846 obj=re.compile(unicode(pattern, "latin-1")) 847 result = obj.search(s) 848 if result is None: 849 print '=== Fails on unicode pattern match', t 850 851 # Try the match with the search area limited to the extent 852 # of the match and see if it still succeeds. \B will 853 # break (because it won't match at the end or start of a 854 # string), so we'll ignore patterns that feature it. 855 856 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \ 857 and result is not None: 858 obj = re.compile(pattern) 859 result = obj.search(s, result.start(0), result.end(0) + 1) 860 if result is None: 861 print '=== Failed on range-limited match', t 862 863 # Try the match with IGNORECASE enabled, and check that it 864 # still succeeds. 865 obj = re.compile(pattern, re.IGNORECASE) 866 result = obj.search(s) 867 if result is None: 868 print '=== Fails on case-insensitive match', t 869 870 # Try the match with LOCALE enabled, and check that it 871 # still succeeds. 872 obj = re.compile(pattern, re.LOCALE) 873 result = obj.search(s) 874 if result is None: 875 print '=== Fails on locale-sensitive match', t 876 877 # Try the match with UNICODE locale enabled, and check 878 # that it still succeeds. 879 obj = re.compile(pattern, re.UNICODE) 880 result = obj.search(s) 881 if result is None: 882 print '=== Fails on unicode-sensitive match', t 883 884def test_main(): 885 run_unittest(ReTests) 886 run_re_tests() 887 888if __name__ == "__main__": 889 test_main() 890