1# 2# Secret Labs' Regular Expression Engine 3# 4# convert re-style regular expression to sre pattern 5# 6# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13# XXX: show string offset and offending character for all errors 14 15from sre_constants import * 16 17SPECIAL_CHARS = ".\\[{()*+?^$|" 18REPEAT_CHARS = "*+?{" 19 20DIGITS = frozenset("0123456789") 21 22OCTDIGITS = frozenset("01234567") 23HEXDIGITS = frozenset("0123456789abcdefABCDEF") 24ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 25 26WHITESPACE = frozenset(" \t\n\r\v\f") 27 28_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) 29_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) 30 31ESCAPES = { 32 r"\a": (LITERAL, ord("\a")), 33 r"\b": (LITERAL, ord("\b")), 34 r"\f": (LITERAL, ord("\f")), 35 r"\n": (LITERAL, ord("\n")), 36 r"\r": (LITERAL, ord("\r")), 37 r"\t": (LITERAL, ord("\t")), 38 r"\v": (LITERAL, ord("\v")), 39 r"\\": (LITERAL, ord("\\")) 40} 41 42CATEGORIES = { 43 r"\A": (AT, AT_BEGINNING_STRING), # start of string 44 r"\b": (AT, AT_BOUNDARY), 45 r"\B": (AT, AT_NON_BOUNDARY), 46 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 47 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 48 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 49 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 50 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 51 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 52 r"\Z": (AT, AT_END_STRING), # end of string 53} 54 55FLAGS = { 56 # standard flags 57 "i": SRE_FLAG_IGNORECASE, 58 "L": SRE_FLAG_LOCALE, 59 "m": SRE_FLAG_MULTILINE, 60 "s": SRE_FLAG_DOTALL, 61 "x": SRE_FLAG_VERBOSE, 62 # extensions 63 "a": SRE_FLAG_ASCII, 64 "t": SRE_FLAG_TEMPLATE, 65 "u": SRE_FLAG_UNICODE, 66} 67 68TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE 69GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE 70 71class Verbose(Exception): 72 pass 73 74class Pattern: 75 # master pattern object. keeps track of global attributes 76 def __init__(self): 77 self.flags = 0 78 self.groupdict = {} 79 self.groupwidths = [None] # group 0 80 self.lookbehindgroups = None 81 @property 82 def groups(self): 83 return len(self.groupwidths) 84 def opengroup(self, name=None): 85 gid = self.groups 86 self.groupwidths.append(None) 87 if self.groups > MAXGROUPS: 88 raise error("too many groups") 89 if name is not None: 90 ogid = self.groupdict.get(name, None) 91 if ogid is not None: 92 raise error("redefinition of group name %r as group %d; " 93 "was group %d" % (name, gid, ogid)) 94 self.groupdict[name] = gid 95 return gid 96 def closegroup(self, gid, p): 97 self.groupwidths[gid] = p.getwidth() 98 def checkgroup(self, gid): 99 return gid < self.groups and self.groupwidths[gid] is not None 100 101 def checklookbehindgroup(self, gid, source): 102 if self.lookbehindgroups is not None: 103 if not self.checkgroup(gid): 104 raise source.error('cannot refer to an open group') 105 if gid >= self.lookbehindgroups: 106 raise source.error('cannot refer to group defined in the same ' 107 'lookbehind subpattern') 108 109class SubPattern: 110 # a subpattern, in intermediate form 111 def __init__(self, pattern, data=None): 112 self.pattern = pattern 113 if data is None: 114 data = [] 115 self.data = data 116 self.width = None 117 118 def dump(self, level=0): 119 nl = True 120 seqtypes = (tuple, list) 121 for op, av in self.data: 122 print(level*" " + str(op), end='') 123 if op is IN: 124 # member sublanguage 125 print() 126 for op, a in av: 127 print((level+1)*" " + str(op), a) 128 elif op is BRANCH: 129 print() 130 for i, a in enumerate(av[1]): 131 if i: 132 print(level*" " + "OR") 133 a.dump(level+1) 134 elif op is GROUPREF_EXISTS: 135 condgroup, item_yes, item_no = av 136 print('', condgroup) 137 item_yes.dump(level+1) 138 if item_no: 139 print(level*" " + "ELSE") 140 item_no.dump(level+1) 141 elif isinstance(av, seqtypes): 142 nl = False 143 for a in av: 144 if isinstance(a, SubPattern): 145 if not nl: 146 print() 147 a.dump(level+1) 148 nl = True 149 else: 150 if not nl: 151 print(' ', end='') 152 print(a, end='') 153 nl = False 154 if not nl: 155 print() 156 else: 157 print('', av) 158 def __repr__(self): 159 return repr(self.data) 160 def __len__(self): 161 return len(self.data) 162 def __delitem__(self, index): 163 del self.data[index] 164 def __getitem__(self, index): 165 if isinstance(index, slice): 166 return SubPattern(self.pattern, self.data[index]) 167 return self.data[index] 168 def __setitem__(self, index, code): 169 self.data[index] = code 170 def insert(self, index, code): 171 self.data.insert(index, code) 172 def append(self, code): 173 self.data.append(code) 174 def getwidth(self): 175 # determine the width (min, max) for this subpattern 176 if self.width is not None: 177 return self.width 178 lo = hi = 0 179 for op, av in self.data: 180 if op is BRANCH: 181 i = MAXREPEAT - 1 182 j = 0 183 for av in av[1]: 184 l, h = av.getwidth() 185 i = min(i, l) 186 j = max(j, h) 187 lo = lo + i 188 hi = hi + j 189 elif op is CALL: 190 i, j = av.getwidth() 191 lo = lo + i 192 hi = hi + j 193 elif op is SUBPATTERN: 194 i, j = av[-1].getwidth() 195 lo = lo + i 196 hi = hi + j 197 elif op in _REPEATCODES: 198 i, j = av[2].getwidth() 199 lo = lo + i * av[0] 200 hi = hi + j * av[1] 201 elif op in _UNITCODES: 202 lo = lo + 1 203 hi = hi + 1 204 elif op is GROUPREF: 205 i, j = self.pattern.groupwidths[av] 206 lo = lo + i 207 hi = hi + j 208 elif op is GROUPREF_EXISTS: 209 i, j = av[1].getwidth() 210 if av[2] is not None: 211 l, h = av[2].getwidth() 212 i = min(i, l) 213 j = max(j, h) 214 else: 215 i = 0 216 lo = lo + i 217 hi = hi + j 218 elif op is SUCCESS: 219 break 220 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) 221 return self.width 222 223class Tokenizer: 224 def __init__(self, string): 225 self.istext = isinstance(string, str) 226 self.string = string 227 if not self.istext: 228 string = str(string, 'latin1') 229 self.decoded_string = string 230 self.index = 0 231 self.next = None 232 self.__next() 233 def __next(self): 234 index = self.index 235 try: 236 char = self.decoded_string[index] 237 except IndexError: 238 self.next = None 239 return 240 if char == "\\": 241 index += 1 242 try: 243 char += self.decoded_string[index] 244 except IndexError: 245 raise error("bad escape (end of pattern)", 246 self.string, len(self.string) - 1) from None 247 self.index = index + 1 248 self.next = char 249 def match(self, char): 250 if char == self.next: 251 self.__next() 252 return True 253 return False 254 def get(self): 255 this = self.next 256 self.__next() 257 return this 258 def getwhile(self, n, charset): 259 result = '' 260 for _ in range(n): 261 c = self.next 262 if c not in charset: 263 break 264 result += c 265 self.__next() 266 return result 267 def getuntil(self, terminator): 268 result = '' 269 while True: 270 c = self.next 271 self.__next() 272 if c is None: 273 if not result: 274 raise self.error("missing group name") 275 raise self.error("missing %s, unterminated name" % terminator, 276 len(result)) 277 if c == terminator: 278 if not result: 279 raise self.error("missing group name", 1) 280 break 281 result += c 282 return result 283 @property 284 def pos(self): 285 return self.index - len(self.next or '') 286 def tell(self): 287 return self.index - len(self.next or '') 288 def seek(self, index): 289 self.index = index 290 self.__next() 291 292 def error(self, msg, offset=0): 293 return error(msg, self.string, self.tell() - offset) 294 295def _class_escape(source, escape): 296 # handle escape code inside character class 297 code = ESCAPES.get(escape) 298 if code: 299 return code 300 code = CATEGORIES.get(escape) 301 if code and code[0] is IN: 302 return code 303 try: 304 c = escape[1:2] 305 if c == "x": 306 # hexadecimal escape (exactly two digits) 307 escape += source.getwhile(2, HEXDIGITS) 308 if len(escape) != 4: 309 raise source.error("incomplete escape %s" % escape, len(escape)) 310 return LITERAL, int(escape[2:], 16) 311 elif c == "u" and source.istext: 312 # unicode escape (exactly four digits) 313 escape += source.getwhile(4, HEXDIGITS) 314 if len(escape) != 6: 315 raise source.error("incomplete escape %s" % escape, len(escape)) 316 return LITERAL, int(escape[2:], 16) 317 elif c == "U" and source.istext: 318 # unicode escape (exactly eight digits) 319 escape += source.getwhile(8, HEXDIGITS) 320 if len(escape) != 10: 321 raise source.error("incomplete escape %s" % escape, len(escape)) 322 c = int(escape[2:], 16) 323 chr(c) # raise ValueError for invalid code 324 return LITERAL, c 325 elif c in OCTDIGITS: 326 # octal escape (up to three digits) 327 escape += source.getwhile(2, OCTDIGITS) 328 c = int(escape[1:], 8) 329 if c > 0o377: 330 raise source.error('octal escape value %s outside of ' 331 'range 0-0o377' % escape, len(escape)) 332 return LITERAL, c 333 elif c in DIGITS: 334 raise ValueError 335 if len(escape) == 2: 336 if c in ASCIILETTERS: 337 raise source.error('bad escape %s' % escape, len(escape)) 338 return LITERAL, ord(escape[1]) 339 except ValueError: 340 pass 341 raise source.error("bad escape %s" % escape, len(escape)) 342 343def _escape(source, escape, state): 344 # handle escape code in expression 345 code = CATEGORIES.get(escape) 346 if code: 347 return code 348 code = ESCAPES.get(escape) 349 if code: 350 return code 351 try: 352 c = escape[1:2] 353 if c == "x": 354 # hexadecimal escape 355 escape += source.getwhile(2, HEXDIGITS) 356 if len(escape) != 4: 357 raise source.error("incomplete escape %s" % escape, len(escape)) 358 return LITERAL, int(escape[2:], 16) 359 elif c == "u" and source.istext: 360 # unicode escape (exactly four digits) 361 escape += source.getwhile(4, HEXDIGITS) 362 if len(escape) != 6: 363 raise source.error("incomplete escape %s" % escape, len(escape)) 364 return LITERAL, int(escape[2:], 16) 365 elif c == "U" and source.istext: 366 # unicode escape (exactly eight digits) 367 escape += source.getwhile(8, HEXDIGITS) 368 if len(escape) != 10: 369 raise source.error("incomplete escape %s" % escape, len(escape)) 370 c = int(escape[2:], 16) 371 chr(c) # raise ValueError for invalid code 372 return LITERAL, c 373 elif c == "0": 374 # octal escape 375 escape += source.getwhile(2, OCTDIGITS) 376 return LITERAL, int(escape[1:], 8) 377 elif c in DIGITS: 378 # octal escape *or* decimal group reference (sigh) 379 if source.next in DIGITS: 380 escape += source.get() 381 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 382 source.next in OCTDIGITS): 383 # got three octal digits; this is an octal escape 384 escape += source.get() 385 c = int(escape[1:], 8) 386 if c > 0o377: 387 raise source.error('octal escape value %s outside of ' 388 'range 0-0o377' % escape, 389 len(escape)) 390 return LITERAL, c 391 # not an octal escape, so this is a group reference 392 group = int(escape[1:]) 393 if group < state.groups: 394 if not state.checkgroup(group): 395 raise source.error("cannot refer to an open group", 396 len(escape)) 397 state.checklookbehindgroup(group, source) 398 return GROUPREF, group 399 raise source.error("invalid group reference %d" % group, len(escape) - 1) 400 if len(escape) == 2: 401 if c in ASCIILETTERS: 402 raise source.error("bad escape %s" % escape, len(escape)) 403 return LITERAL, ord(escape[1]) 404 except ValueError: 405 pass 406 raise source.error("bad escape %s" % escape, len(escape)) 407 408def _uniq(items): 409 if len(set(items)) == len(items): 410 return items 411 newitems = [] 412 for item in items: 413 if item not in newitems: 414 newitems.append(item) 415 return newitems 416 417def _parse_sub(source, state, verbose, nested): 418 # parse an alternation: a|b|c 419 420 items = [] 421 itemsappend = items.append 422 sourcematch = source.match 423 start = source.tell() 424 while True: 425 itemsappend(_parse(source, state, verbose, nested + 1, 426 not nested and not items)) 427 if not sourcematch("|"): 428 break 429 430 if len(items) == 1: 431 return items[0] 432 433 subpattern = SubPattern(state) 434 435 # check if all items share a common prefix 436 while True: 437 prefix = None 438 for item in items: 439 if not item: 440 break 441 if prefix is None: 442 prefix = item[0] 443 elif item[0] != prefix: 444 break 445 else: 446 # all subitems start with a common "prefix". 447 # move it out of the branch 448 for item in items: 449 del item[0] 450 subpattern.append(prefix) 451 continue # check next one 452 break 453 454 # check if the branch can be replaced by a character set 455 set = [] 456 for item in items: 457 if len(item) != 1: 458 break 459 op, av = item[0] 460 if op is LITERAL: 461 set.append((op, av)) 462 elif op is IN and av[0][0] is not NEGATE: 463 set.extend(av) 464 else: 465 break 466 else: 467 # we can store this as a character set instead of a 468 # branch (the compiler may optimize this even more) 469 subpattern.append((IN, _uniq(set))) 470 return subpattern 471 472 subpattern.append((BRANCH, (None, items))) 473 return subpattern 474 475def _parse(source, state, verbose, nested, first=False): 476 # parse a simple pattern 477 subpattern = SubPattern(state) 478 479 # precompute constants into local variables 480 subpatternappend = subpattern.append 481 sourceget = source.get 482 sourcematch = source.match 483 _len = len 484 _ord = ord 485 486 while True: 487 488 this = source.next 489 if this is None: 490 break # end of pattern 491 if this in "|)": 492 break # end of subpattern 493 sourceget() 494 495 if verbose: 496 # skip whitespace and comments 497 if this in WHITESPACE: 498 continue 499 if this == "#": 500 while True: 501 this = sourceget() 502 if this is None or this == "\n": 503 break 504 continue 505 506 if this[0] == "\\": 507 code = _escape(source, this, state) 508 subpatternappend(code) 509 510 elif this not in SPECIAL_CHARS: 511 subpatternappend((LITERAL, _ord(this))) 512 513 elif this == "[": 514 here = source.tell() - 1 515 # character set 516 set = [] 517 setappend = set.append 518## if sourcematch(":"): 519## pass # handle character classes 520 if source.next == '[': 521 import warnings 522 warnings.warn( 523 'Possible nested set at position %d' % source.tell(), 524 FutureWarning, stacklevel=nested + 6 525 ) 526 negate = sourcematch("^") 527 # check remaining characters 528 while True: 529 this = sourceget() 530 if this is None: 531 raise source.error("unterminated character set", 532 source.tell() - here) 533 if this == "]" and set: 534 break 535 elif this[0] == "\\": 536 code1 = _class_escape(source, this) 537 else: 538 if set and this in '-&~|' and source.next == this: 539 import warnings 540 warnings.warn( 541 'Possible set %s at position %d' % ( 542 'difference' if this == '-' else 543 'intersection' if this == '&' else 544 'symmetric difference' if this == '~' else 545 'union', 546 source.tell() - 1), 547 FutureWarning, stacklevel=nested + 6 548 ) 549 code1 = LITERAL, _ord(this) 550 if sourcematch("-"): 551 # potential range 552 that = sourceget() 553 if that is None: 554 raise source.error("unterminated character set", 555 source.tell() - here) 556 if that == "]": 557 if code1[0] is IN: 558 code1 = code1[1][0] 559 setappend(code1) 560 setappend((LITERAL, _ord("-"))) 561 break 562 if that[0] == "\\": 563 code2 = _class_escape(source, that) 564 else: 565 if that == '-': 566 import warnings 567 warnings.warn( 568 'Possible set difference at position %d' % ( 569 source.tell() - 2), 570 FutureWarning, stacklevel=nested + 6 571 ) 572 code2 = LITERAL, _ord(that) 573 if code1[0] != LITERAL or code2[0] != LITERAL: 574 msg = "bad character range %s-%s" % (this, that) 575 raise source.error(msg, len(this) + 1 + len(that)) 576 lo = code1[1] 577 hi = code2[1] 578 if hi < lo: 579 msg = "bad character range %s-%s" % (this, that) 580 raise source.error(msg, len(this) + 1 + len(that)) 581 setappend((RANGE, (lo, hi))) 582 else: 583 if code1[0] is IN: 584 code1 = code1[1][0] 585 setappend(code1) 586 587 set = _uniq(set) 588 # XXX: <fl> should move set optimization to compiler! 589 if _len(set) == 1 and set[0][0] is LITERAL: 590 # optimization 591 if negate: 592 subpatternappend((NOT_LITERAL, set[0][1])) 593 else: 594 subpatternappend(set[0]) 595 else: 596 if negate: 597 set.insert(0, (NEGATE, None)) 598 # charmap optimization can't be added here because 599 # global flags still are not known 600 subpatternappend((IN, set)) 601 602 elif this in REPEAT_CHARS: 603 # repeat previous item 604 here = source.tell() 605 if this == "?": 606 min, max = 0, 1 607 elif this == "*": 608 min, max = 0, MAXREPEAT 609 610 elif this == "+": 611 min, max = 1, MAXREPEAT 612 elif this == "{": 613 if source.next == "}": 614 subpatternappend((LITERAL, _ord(this))) 615 continue 616 617 min, max = 0, MAXREPEAT 618 lo = hi = "" 619 while source.next in DIGITS: 620 lo += sourceget() 621 if sourcematch(","): 622 while source.next in DIGITS: 623 hi += sourceget() 624 else: 625 hi = lo 626 if not sourcematch("}"): 627 subpatternappend((LITERAL, _ord(this))) 628 source.seek(here) 629 continue 630 631 if lo: 632 min = int(lo) 633 if min >= MAXREPEAT: 634 raise OverflowError("the repetition number is too large") 635 if hi: 636 max = int(hi) 637 if max >= MAXREPEAT: 638 raise OverflowError("the repetition number is too large") 639 if max < min: 640 raise source.error("min repeat greater than max repeat", 641 source.tell() - here) 642 else: 643 raise AssertionError("unsupported quantifier %r" % (char,)) 644 # figure out which item to repeat 645 if subpattern: 646 item = subpattern[-1:] 647 else: 648 item = None 649 if not item or item[0][0] is AT: 650 raise source.error("nothing to repeat", 651 source.tell() - here + len(this)) 652 if item[0][0] in _REPEATCODES: 653 raise source.error("multiple repeat", 654 source.tell() - here + len(this)) 655 if item[0][0] is SUBPATTERN: 656 group, add_flags, del_flags, p = item[0][1] 657 if group is None and not add_flags and not del_flags: 658 item = p 659 if sourcematch("?"): 660 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 661 else: 662 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 663 664 elif this == ".": 665 subpatternappend((ANY, None)) 666 667 elif this == "(": 668 start = source.tell() - 1 669 group = True 670 name = None 671 add_flags = 0 672 del_flags = 0 673 if sourcematch("?"): 674 # options 675 char = sourceget() 676 if char is None: 677 raise source.error("unexpected end of pattern") 678 if char == "P": 679 # python extensions 680 if sourcematch("<"): 681 # named group: skip forward to end of name 682 name = source.getuntil(">") 683 if not name.isidentifier(): 684 msg = "bad character in group name %r" % name 685 raise source.error(msg, len(name) + 1) 686 elif sourcematch("="): 687 # named backreference 688 name = source.getuntil(")") 689 if not name.isidentifier(): 690 msg = "bad character in group name %r" % name 691 raise source.error(msg, len(name) + 1) 692 gid = state.groupdict.get(name) 693 if gid is None: 694 msg = "unknown group name %r" % name 695 raise source.error(msg, len(name) + 1) 696 if not state.checkgroup(gid): 697 raise source.error("cannot refer to an open group", 698 len(name) + 1) 699 state.checklookbehindgroup(gid, source) 700 subpatternappend((GROUPREF, gid)) 701 continue 702 703 else: 704 char = sourceget() 705 if char is None: 706 raise source.error("unexpected end of pattern") 707 raise source.error("unknown extension ?P" + char, 708 len(char) + 2) 709 elif char == ":": 710 # non-capturing group 711 group = None 712 elif char == "#": 713 # comment 714 while True: 715 if source.next is None: 716 raise source.error("missing ), unterminated comment", 717 source.tell() - start) 718 if sourceget() == ")": 719 break 720 continue 721 722 elif char in "=!<": 723 # lookahead assertions 724 dir = 1 725 if char == "<": 726 char = sourceget() 727 if char is None: 728 raise source.error("unexpected end of pattern") 729 if char not in "=!": 730 raise source.error("unknown extension ?<" + char, 731 len(char) + 2) 732 dir = -1 # lookbehind 733 lookbehindgroups = state.lookbehindgroups 734 if lookbehindgroups is None: 735 state.lookbehindgroups = state.groups 736 p = _parse_sub(source, state, verbose, nested + 1) 737 if dir < 0: 738 if lookbehindgroups is None: 739 state.lookbehindgroups = None 740 if not sourcematch(")"): 741 raise source.error("missing ), unterminated subpattern", 742 source.tell() - start) 743 if char == "=": 744 subpatternappend((ASSERT, (dir, p))) 745 else: 746 subpatternappend((ASSERT_NOT, (dir, p))) 747 continue 748 749 elif char == "(": 750 # conditional backreference group 751 condname = source.getuntil(")") 752 if condname.isidentifier(): 753 condgroup = state.groupdict.get(condname) 754 if condgroup is None: 755 msg = "unknown group name %r" % condname 756 raise source.error(msg, len(condname) + 1) 757 else: 758 try: 759 condgroup = int(condname) 760 if condgroup < 0: 761 raise ValueError 762 except ValueError: 763 msg = "bad character in group name %r" % condname 764 raise source.error(msg, len(condname) + 1) from None 765 if not condgroup: 766 raise source.error("bad group number", 767 len(condname) + 1) 768 if condgroup >= MAXGROUPS: 769 msg = "invalid group reference %d" % condgroup 770 raise source.error(msg, len(condname) + 1) 771 state.checklookbehindgroup(condgroup, source) 772 item_yes = _parse(source, state, verbose, nested + 1) 773 if source.match("|"): 774 item_no = _parse(source, state, verbose, nested + 1) 775 if source.next == "|": 776 raise source.error("conditional backref with more than two branches") 777 else: 778 item_no = None 779 if not source.match(")"): 780 raise source.error("missing ), unterminated subpattern", 781 source.tell() - start) 782 subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 783 continue 784 785 elif char in FLAGS or char == "-": 786 # flags 787 flags = _parse_flags(source, state, char) 788 if flags is None: # global flags 789 if not first or subpattern: 790 import warnings 791 warnings.warn( 792 'Flags not at the start of the expression %r%s' % ( 793 source.string[:20], # truncate long regexes 794 ' (truncated)' if len(source.string) > 20 else '', 795 ), 796 DeprecationWarning, stacklevel=nested + 6 797 ) 798 if (state.flags & SRE_FLAG_VERBOSE) and not verbose: 799 raise Verbose 800 continue 801 802 add_flags, del_flags = flags 803 group = None 804 else: 805 raise source.error("unknown extension ?" + char, 806 len(char) + 1) 807 808 # parse group contents 809 if group is not None: 810 try: 811 group = state.opengroup(name) 812 except error as err: 813 raise source.error(err.msg, len(name) + 1) from None 814 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and 815 not (del_flags & SRE_FLAG_VERBOSE)) 816 p = _parse_sub(source, state, sub_verbose, nested + 1) 817 if not source.match(")"): 818 raise source.error("missing ), unterminated subpattern", 819 source.tell() - start) 820 if group is not None: 821 state.closegroup(group, p) 822 subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) 823 824 elif this == "^": 825 subpatternappend((AT, AT_BEGINNING)) 826 827 elif this == "$": 828 subpatternappend((AT, AT_END)) 829 830 else: 831 raise AssertionError("unsupported special character %r" % (char,)) 832 833 # unpack non-capturing groups 834 for i in range(len(subpattern))[::-1]: 835 op, av = subpattern[i] 836 if op is SUBPATTERN: 837 group, add_flags, del_flags, p = av 838 if group is None and not add_flags and not del_flags: 839 subpattern[i: i+1] = p 840 841 return subpattern 842 843def _parse_flags(source, state, char): 844 sourceget = source.get 845 add_flags = 0 846 del_flags = 0 847 if char != "-": 848 while True: 849 flag = FLAGS[char] 850 if source.istext: 851 if char == 'L': 852 msg = "bad inline flags: cannot use 'L' flag with a str pattern" 853 raise source.error(msg) 854 else: 855 if char == 'u': 856 msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" 857 raise source.error(msg) 858 add_flags |= flag 859 if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: 860 msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" 861 raise source.error(msg) 862 char = sourceget() 863 if char is None: 864 raise source.error("missing -, : or )") 865 if char in ")-:": 866 break 867 if char not in FLAGS: 868 msg = "unknown flag" if char.isalpha() else "missing -, : or )" 869 raise source.error(msg, len(char)) 870 if char == ")": 871 state.flags |= add_flags 872 return None 873 if add_flags & GLOBAL_FLAGS: 874 raise source.error("bad inline flags: cannot turn on global flag", 1) 875 if char == "-": 876 char = sourceget() 877 if char is None: 878 raise source.error("missing flag") 879 if char not in FLAGS: 880 msg = "unknown flag" if char.isalpha() else "missing flag" 881 raise source.error(msg, len(char)) 882 while True: 883 flag = FLAGS[char] 884 if flag & TYPE_FLAGS: 885 msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" 886 raise source.error(msg) 887 del_flags |= flag 888 char = sourceget() 889 if char is None: 890 raise source.error("missing :") 891 if char == ":": 892 break 893 if char not in FLAGS: 894 msg = "unknown flag" if char.isalpha() else "missing :" 895 raise source.error(msg, len(char)) 896 assert char == ":" 897 if del_flags & GLOBAL_FLAGS: 898 raise source.error("bad inline flags: cannot turn off global flag", 1) 899 if add_flags & del_flags: 900 raise source.error("bad inline flags: flag turned on and off", 1) 901 return add_flags, del_flags 902 903def fix_flags(src, flags): 904 # Check and fix flags according to the type of pattern (str or bytes) 905 if isinstance(src, str): 906 if flags & SRE_FLAG_LOCALE: 907 raise ValueError("cannot use LOCALE flag with a str pattern") 908 if not flags & SRE_FLAG_ASCII: 909 flags |= SRE_FLAG_UNICODE 910 elif flags & SRE_FLAG_UNICODE: 911 raise ValueError("ASCII and UNICODE flags are incompatible") 912 else: 913 if flags & SRE_FLAG_UNICODE: 914 raise ValueError("cannot use UNICODE flag with a bytes pattern") 915 if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: 916 raise ValueError("ASCII and LOCALE flags are incompatible") 917 return flags 918 919def parse(str, flags=0, pattern=None): 920 # parse 're' pattern into list of (opcode, argument) tuples 921 922 source = Tokenizer(str) 923 924 if pattern is None: 925 pattern = Pattern() 926 pattern.flags = flags 927 pattern.str = str 928 929 try: 930 p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0) 931 except Verbose: 932 # the VERBOSE flag was switched on inside the pattern. to be 933 # on the safe side, we'll parse the whole thing again... 934 pattern = Pattern() 935 pattern.flags = flags | SRE_FLAG_VERBOSE 936 pattern.str = str 937 source.seek(0) 938 p = _parse_sub(source, pattern, True, 0) 939 940 p.pattern.flags = fix_flags(str, p.pattern.flags) 941 942 if source.next is not None: 943 assert source.next == ")" 944 raise source.error("unbalanced parenthesis") 945 946 if flags & SRE_FLAG_DEBUG: 947 p.dump() 948 949 return p 950 951def parse_template(source, pattern): 952 # parse 're' replacement string into list of literals and 953 # group references 954 s = Tokenizer(source) 955 sget = s.get 956 groups = [] 957 literals = [] 958 literal = [] 959 lappend = literal.append 960 def addgroup(index, pos): 961 if index > pattern.groups: 962 raise s.error("invalid group reference %d" % index, pos) 963 if literal: 964 literals.append(''.join(literal)) 965 del literal[:] 966 groups.append((len(literals), index)) 967 literals.append(None) 968 groupindex = pattern.groupindex 969 while True: 970 this = sget() 971 if this is None: 972 break # end of replacement string 973 if this[0] == "\\": 974 # group 975 c = this[1] 976 if c == "g": 977 name = "" 978 if not s.match("<"): 979 raise s.error("missing <") 980 name = s.getuntil(">") 981 if name.isidentifier(): 982 try: 983 index = groupindex[name] 984 except KeyError: 985 raise IndexError("unknown group name %r" % name) 986 else: 987 try: 988 index = int(name) 989 if index < 0: 990 raise ValueError 991 except ValueError: 992 raise s.error("bad character in group name %r" % name, 993 len(name) + 1) from None 994 if index >= MAXGROUPS: 995 raise s.error("invalid group reference %d" % index, 996 len(name) + 1) 997 addgroup(index, len(name) + 1) 998 elif c == "0": 999 if s.next in OCTDIGITS: 1000 this += sget() 1001 if s.next in OCTDIGITS: 1002 this += sget() 1003 lappend(chr(int(this[1:], 8) & 0xff)) 1004 elif c in DIGITS: 1005 isoctal = False 1006 if s.next in DIGITS: 1007 this += sget() 1008 if (c in OCTDIGITS and this[2] in OCTDIGITS and 1009 s.next in OCTDIGITS): 1010 this += sget() 1011 isoctal = True 1012 c = int(this[1:], 8) 1013 if c > 0o377: 1014 raise s.error('octal escape value %s outside of ' 1015 'range 0-0o377' % this, len(this)) 1016 lappend(chr(c)) 1017 if not isoctal: 1018 addgroup(int(this[1:]), len(this) - 1) 1019 else: 1020 try: 1021 this = chr(ESCAPES[this][1]) 1022 except KeyError: 1023 if c in ASCIILETTERS: 1024 raise s.error('bad escape %s' % this, len(this)) 1025 lappend(this) 1026 else: 1027 lappend(this) 1028 if literal: 1029 literals.append(''.join(literal)) 1030 if not isinstance(source, str): 1031 # The tokenizer implicitly decodes bytes objects as latin-1, we must 1032 # therefore re-encode the final representation. 1033 literals = [None if s is None else s.encode('latin-1') for s in literals] 1034 return groups, literals 1035 1036def expand_template(template, match): 1037 g = match.group 1038 empty = match.string[:0] 1039 groups, literals = template 1040 literals = literals[:] 1041 try: 1042 for index, group in groups: 1043 literals[index] = g(group) or empty 1044 except IndexError: 1045 raise error("invalid group reference %d" % index) 1046 return empty.join(literals) 1047