1#
2# Secret Labs' Regular Expression Engine
3#
4# convert re-style regular expression to sre pattern
5#
6# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13# XXX: show string offset and offending character for all errors
14
15from sre_constants import *
16
17SPECIAL_CHARS = ".\\[{()*+?^$|"
18REPEAT_CHARS = "*+?{"
19
20DIGITS = frozenset("0123456789")
21
22OCTDIGITS = frozenset("01234567")
23HEXDIGITS = frozenset("0123456789abcdefABCDEF")
24ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
25
26WHITESPACE = frozenset(" \t\n\r\v\f")
27
28_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
29_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
30
31ESCAPES = {
32    r"\a": (LITERAL, ord("\a")),
33    r"\b": (LITERAL, ord("\b")),
34    r"\f": (LITERAL, ord("\f")),
35    r"\n": (LITERAL, ord("\n")),
36    r"\r": (LITERAL, ord("\r")),
37    r"\t": (LITERAL, ord("\t")),
38    r"\v": (LITERAL, ord("\v")),
39    r"\\": (LITERAL, ord("\\"))
40}
41
42CATEGORIES = {
43    r"\A": (AT, AT_BEGINNING_STRING), # start of string
44    r"\b": (AT, AT_BOUNDARY),
45    r"\B": (AT, AT_NON_BOUNDARY),
46    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
47    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
48    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
49    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
50    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
51    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
52    r"\Z": (AT, AT_END_STRING), # end of string
53}
54
55FLAGS = {
56    # standard flags
57    "i": SRE_FLAG_IGNORECASE,
58    "L": SRE_FLAG_LOCALE,
59    "m": SRE_FLAG_MULTILINE,
60    "s": SRE_FLAG_DOTALL,
61    "x": SRE_FLAG_VERBOSE,
62    # extensions
63    "a": SRE_FLAG_ASCII,
64    "t": SRE_FLAG_TEMPLATE,
65    "u": SRE_FLAG_UNICODE,
66}
67
68TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
69GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
70
71class Verbose(Exception):
72    pass
73
74class Pattern:
75    # master pattern object.  keeps track of global attributes
76    def __init__(self):
77        self.flags = 0
78        self.groupdict = {}
79        self.groupwidths = [None]  # group 0
80        self.lookbehindgroups = None
81    @property
82    def groups(self):
83        return len(self.groupwidths)
84    def opengroup(self, name=None):
85        gid = self.groups
86        self.groupwidths.append(None)
87        if self.groups > MAXGROUPS:
88            raise error("too many groups")
89        if name is not None:
90            ogid = self.groupdict.get(name, None)
91            if ogid is not None:
92                raise error("redefinition of group name %r as group %d; "
93                            "was group %d" % (name, gid,  ogid))
94            self.groupdict[name] = gid
95        return gid
96    def closegroup(self, gid, p):
97        self.groupwidths[gid] = p.getwidth()
98    def checkgroup(self, gid):
99        return gid < self.groups and self.groupwidths[gid] is not None
100
101    def checklookbehindgroup(self, gid, source):
102        if self.lookbehindgroups is not None:
103            if not self.checkgroup(gid):
104                raise source.error('cannot refer to an open group')
105            if gid >= self.lookbehindgroups:
106                raise source.error('cannot refer to group defined in the same '
107                                   'lookbehind subpattern')
108
109class SubPattern:
110    # a subpattern, in intermediate form
111    def __init__(self, pattern, data=None):
112        self.pattern = pattern
113        if data is None:
114            data = []
115        self.data = data
116        self.width = None
117
118    def dump(self, level=0):
119        nl = True
120        seqtypes = (tuple, list)
121        for op, av in self.data:
122            print(level*"  " + str(op), end='')
123            if op is IN:
124                # member sublanguage
125                print()
126                for op, a in av:
127                    print((level+1)*"  " + str(op), a)
128            elif op is BRANCH:
129                print()
130                for i, a in enumerate(av[1]):
131                    if i:
132                        print(level*"  " + "OR")
133                    a.dump(level+1)
134            elif op is GROUPREF_EXISTS:
135                condgroup, item_yes, item_no = av
136                print('', condgroup)
137                item_yes.dump(level+1)
138                if item_no:
139                    print(level*"  " + "ELSE")
140                    item_no.dump(level+1)
141            elif isinstance(av, seqtypes):
142                nl = False
143                for a in av:
144                    if isinstance(a, SubPattern):
145                        if not nl:
146                            print()
147                        a.dump(level+1)
148                        nl = True
149                    else:
150                        if not nl:
151                            print(' ', end='')
152                        print(a, end='')
153                        nl = False
154                if not nl:
155                    print()
156            else:
157                print('', av)
158    def __repr__(self):
159        return repr(self.data)
160    def __len__(self):
161        return len(self.data)
162    def __delitem__(self, index):
163        del self.data[index]
164    def __getitem__(self, index):
165        if isinstance(index, slice):
166            return SubPattern(self.pattern, self.data[index])
167        return self.data[index]
168    def __setitem__(self, index, code):
169        self.data[index] = code
170    def insert(self, index, code):
171        self.data.insert(index, code)
172    def append(self, code):
173        self.data.append(code)
174    def getwidth(self):
175        # determine the width (min, max) for this subpattern
176        if self.width is not None:
177            return self.width
178        lo = hi = 0
179        for op, av in self.data:
180            if op is BRANCH:
181                i = MAXREPEAT - 1
182                j = 0
183                for av in av[1]:
184                    l, h = av.getwidth()
185                    i = min(i, l)
186                    j = max(j, h)
187                lo = lo + i
188                hi = hi + j
189            elif op is CALL:
190                i, j = av.getwidth()
191                lo = lo + i
192                hi = hi + j
193            elif op is SUBPATTERN:
194                i, j = av[-1].getwidth()
195                lo = lo + i
196                hi = hi + j
197            elif op in _REPEATCODES:
198                i, j = av[2].getwidth()
199                lo = lo + i * av[0]
200                hi = hi + j * av[1]
201            elif op in _UNITCODES:
202                lo = lo + 1
203                hi = hi + 1
204            elif op is GROUPREF:
205                i, j = self.pattern.groupwidths[av]
206                lo = lo + i
207                hi = hi + j
208            elif op is GROUPREF_EXISTS:
209                i, j = av[1].getwidth()
210                if av[2] is not None:
211                    l, h = av[2].getwidth()
212                    i = min(i, l)
213                    j = max(j, h)
214                else:
215                    i = 0
216                lo = lo + i
217                hi = hi + j
218            elif op is SUCCESS:
219                break
220        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
221        return self.width
222
223class Tokenizer:
224    def __init__(self, string):
225        self.istext = isinstance(string, str)
226        self.string = string
227        if not self.istext:
228            string = str(string, 'latin1')
229        self.decoded_string = string
230        self.index = 0
231        self.next = None
232        self.__next()
233    def __next(self):
234        index = self.index
235        try:
236            char = self.decoded_string[index]
237        except IndexError:
238            self.next = None
239            return
240        if char == "\\":
241            index += 1
242            try:
243                char += self.decoded_string[index]
244            except IndexError:
245                raise error("bad escape (end of pattern)",
246                            self.string, len(self.string) - 1) from None
247        self.index = index + 1
248        self.next = char
249    def match(self, char):
250        if char == self.next:
251            self.__next()
252            return True
253        return False
254    def get(self):
255        this = self.next
256        self.__next()
257        return this
258    def getwhile(self, n, charset):
259        result = ''
260        for _ in range(n):
261            c = self.next
262            if c not in charset:
263                break
264            result += c
265            self.__next()
266        return result
267    def getuntil(self, terminator):
268        result = ''
269        while True:
270            c = self.next
271            self.__next()
272            if c is None:
273                if not result:
274                    raise self.error("missing group name")
275                raise self.error("missing %s, unterminated name" % terminator,
276                                 len(result))
277            if c == terminator:
278                if not result:
279                    raise self.error("missing group name", 1)
280                break
281            result += c
282        return result
283    @property
284    def pos(self):
285        return self.index - len(self.next or '')
286    def tell(self):
287        return self.index - len(self.next or '')
288    def seek(self, index):
289        self.index = index
290        self.__next()
291
292    def error(self, msg, offset=0):
293        return error(msg, self.string, self.tell() - offset)
294
295def _class_escape(source, escape):
296    # handle escape code inside character class
297    code = ESCAPES.get(escape)
298    if code:
299        return code
300    code = CATEGORIES.get(escape)
301    if code and code[0] is IN:
302        return code
303    try:
304        c = escape[1:2]
305        if c == "x":
306            # hexadecimal escape (exactly two digits)
307            escape += source.getwhile(2, HEXDIGITS)
308            if len(escape) != 4:
309                raise source.error("incomplete escape %s" % escape, len(escape))
310            return LITERAL, int(escape[2:], 16)
311        elif c == "u" and source.istext:
312            # unicode escape (exactly four digits)
313            escape += source.getwhile(4, HEXDIGITS)
314            if len(escape) != 6:
315                raise source.error("incomplete escape %s" % escape, len(escape))
316            return LITERAL, int(escape[2:], 16)
317        elif c == "U" and source.istext:
318            # unicode escape (exactly eight digits)
319            escape += source.getwhile(8, HEXDIGITS)
320            if len(escape) != 10:
321                raise source.error("incomplete escape %s" % escape, len(escape))
322            c = int(escape[2:], 16)
323            chr(c) # raise ValueError for invalid code
324            return LITERAL, c
325        elif c in OCTDIGITS:
326            # octal escape (up to three digits)
327            escape += source.getwhile(2, OCTDIGITS)
328            c = int(escape[1:], 8)
329            if c > 0o377:
330                raise source.error('octal escape value %s outside of '
331                                   'range 0-0o377' % escape, len(escape))
332            return LITERAL, c
333        elif c in DIGITS:
334            raise ValueError
335        if len(escape) == 2:
336            if c in ASCIILETTERS:
337                raise source.error('bad escape %s' % escape, len(escape))
338            return LITERAL, ord(escape[1])
339    except ValueError:
340        pass
341    raise source.error("bad escape %s" % escape, len(escape))
342
343def _escape(source, escape, state):
344    # handle escape code in expression
345    code = CATEGORIES.get(escape)
346    if code:
347        return code
348    code = ESCAPES.get(escape)
349    if code:
350        return code
351    try:
352        c = escape[1:2]
353        if c == "x":
354            # hexadecimal escape
355            escape += source.getwhile(2, HEXDIGITS)
356            if len(escape) != 4:
357                raise source.error("incomplete escape %s" % escape, len(escape))
358            return LITERAL, int(escape[2:], 16)
359        elif c == "u" and source.istext:
360            # unicode escape (exactly four digits)
361            escape += source.getwhile(4, HEXDIGITS)
362            if len(escape) != 6:
363                raise source.error("incomplete escape %s" % escape, len(escape))
364            return LITERAL, int(escape[2:], 16)
365        elif c == "U" and source.istext:
366            # unicode escape (exactly eight digits)
367            escape += source.getwhile(8, HEXDIGITS)
368            if len(escape) != 10:
369                raise source.error("incomplete escape %s" % escape, len(escape))
370            c = int(escape[2:], 16)
371            chr(c) # raise ValueError for invalid code
372            return LITERAL, c
373        elif c == "0":
374            # octal escape
375            escape += source.getwhile(2, OCTDIGITS)
376            return LITERAL, int(escape[1:], 8)
377        elif c in DIGITS:
378            # octal escape *or* decimal group reference (sigh)
379            if source.next in DIGITS:
380                escape += source.get()
381                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
382                    source.next in OCTDIGITS):
383                    # got three octal digits; this is an octal escape
384                    escape += source.get()
385                    c = int(escape[1:], 8)
386                    if c > 0o377:
387                        raise source.error('octal escape value %s outside of '
388                                           'range 0-0o377' % escape,
389                                           len(escape))
390                    return LITERAL, c
391            # not an octal escape, so this is a group reference
392            group = int(escape[1:])
393            if group < state.groups:
394                if not state.checkgroup(group):
395                    raise source.error("cannot refer to an open group",
396                                       len(escape))
397                state.checklookbehindgroup(group, source)
398                return GROUPREF, group
399            raise source.error("invalid group reference %d" % group, len(escape) - 1)
400        if len(escape) == 2:
401            if c in ASCIILETTERS:
402                raise source.error("bad escape %s" % escape, len(escape))
403            return LITERAL, ord(escape[1])
404    except ValueError:
405        pass
406    raise source.error("bad escape %s" % escape, len(escape))
407
408def _uniq(items):
409    if len(set(items)) == len(items):
410        return items
411    newitems = []
412    for item in items:
413        if item not in newitems:
414            newitems.append(item)
415    return newitems
416
417def _parse_sub(source, state, verbose, nested):
418    # parse an alternation: a|b|c
419
420    items = []
421    itemsappend = items.append
422    sourcematch = source.match
423    start = source.tell()
424    while True:
425        itemsappend(_parse(source, state, verbose, nested + 1,
426                           not nested and not items))
427        if not sourcematch("|"):
428            break
429
430    if len(items) == 1:
431        return items[0]
432
433    subpattern = SubPattern(state)
434
435    # check if all items share a common prefix
436    while True:
437        prefix = None
438        for item in items:
439            if not item:
440                break
441            if prefix is None:
442                prefix = item[0]
443            elif item[0] != prefix:
444                break
445        else:
446            # all subitems start with a common "prefix".
447            # move it out of the branch
448            for item in items:
449                del item[0]
450            subpattern.append(prefix)
451            continue # check next one
452        break
453
454    # check if the branch can be replaced by a character set
455    set = []
456    for item in items:
457        if len(item) != 1:
458            break
459        op, av = item[0]
460        if op is LITERAL:
461            set.append((op, av))
462        elif op is IN and av[0][0] is not NEGATE:
463            set.extend(av)
464        else:
465            break
466    else:
467        # we can store this as a character set instead of a
468        # branch (the compiler may optimize this even more)
469        subpattern.append((IN, _uniq(set)))
470        return subpattern
471
472    subpattern.append((BRANCH, (None, items)))
473    return subpattern
474
475def _parse(source, state, verbose, nested, first=False):
476    # parse a simple pattern
477    subpattern = SubPattern(state)
478
479    # precompute constants into local variables
480    subpatternappend = subpattern.append
481    sourceget = source.get
482    sourcematch = source.match
483    _len = len
484    _ord = ord
485
486    while True:
487
488        this = source.next
489        if this is None:
490            break # end of pattern
491        if this in "|)":
492            break # end of subpattern
493        sourceget()
494
495        if verbose:
496            # skip whitespace and comments
497            if this in WHITESPACE:
498                continue
499            if this == "#":
500                while True:
501                    this = sourceget()
502                    if this is None or this == "\n":
503                        break
504                continue
505
506        if this[0] == "\\":
507            code = _escape(source, this, state)
508            subpatternappend(code)
509
510        elif this not in SPECIAL_CHARS:
511            subpatternappend((LITERAL, _ord(this)))
512
513        elif this == "[":
514            here = source.tell() - 1
515            # character set
516            set = []
517            setappend = set.append
518##          if sourcematch(":"):
519##              pass # handle character classes
520            if source.next == '[':
521                import warnings
522                warnings.warn(
523                    'Possible nested set at position %d' % source.tell(),
524                    FutureWarning, stacklevel=nested + 6
525                )
526            negate = sourcematch("^")
527            # check remaining characters
528            while True:
529                this = sourceget()
530                if this is None:
531                    raise source.error("unterminated character set",
532                                       source.tell() - here)
533                if this == "]" and set:
534                    break
535                elif this[0] == "\\":
536                    code1 = _class_escape(source, this)
537                else:
538                    if set and this in '-&~|' and source.next == this:
539                        import warnings
540                        warnings.warn(
541                            'Possible set %s at position %d' % (
542                                'difference' if this == '-' else
543                                'intersection' if this == '&' else
544                                'symmetric difference' if this == '~' else
545                                'union',
546                                source.tell() - 1),
547                            FutureWarning, stacklevel=nested + 6
548                        )
549                    code1 = LITERAL, _ord(this)
550                if sourcematch("-"):
551                    # potential range
552                    that = sourceget()
553                    if that is None:
554                        raise source.error("unterminated character set",
555                                           source.tell() - here)
556                    if that == "]":
557                        if code1[0] is IN:
558                            code1 = code1[1][0]
559                        setappend(code1)
560                        setappend((LITERAL, _ord("-")))
561                        break
562                    if that[0] == "\\":
563                        code2 = _class_escape(source, that)
564                    else:
565                        if that == '-':
566                            import warnings
567                            warnings.warn(
568                                'Possible set difference at position %d' % (
569                                    source.tell() - 2),
570                                FutureWarning, stacklevel=nested + 6
571                            )
572                        code2 = LITERAL, _ord(that)
573                    if code1[0] != LITERAL or code2[0] != LITERAL:
574                        msg = "bad character range %s-%s" % (this, that)
575                        raise source.error(msg, len(this) + 1 + len(that))
576                    lo = code1[1]
577                    hi = code2[1]
578                    if hi < lo:
579                        msg = "bad character range %s-%s" % (this, that)
580                        raise source.error(msg, len(this) + 1 + len(that))
581                    setappend((RANGE, (lo, hi)))
582                else:
583                    if code1[0] is IN:
584                        code1 = code1[1][0]
585                    setappend(code1)
586
587            set = _uniq(set)
588            # XXX: <fl> should move set optimization to compiler!
589            if _len(set) == 1 and set[0][0] is LITERAL:
590                # optimization
591                if negate:
592                    subpatternappend((NOT_LITERAL, set[0][1]))
593                else:
594                    subpatternappend(set[0])
595            else:
596                if negate:
597                    set.insert(0, (NEGATE, None))
598                # charmap optimization can't be added here because
599                # global flags still are not known
600                subpatternappend((IN, set))
601
602        elif this in REPEAT_CHARS:
603            # repeat previous item
604            here = source.tell()
605            if this == "?":
606                min, max = 0, 1
607            elif this == "*":
608                min, max = 0, MAXREPEAT
609
610            elif this == "+":
611                min, max = 1, MAXREPEAT
612            elif this == "{":
613                if source.next == "}":
614                    subpatternappend((LITERAL, _ord(this)))
615                    continue
616
617                min, max = 0, MAXREPEAT
618                lo = hi = ""
619                while source.next in DIGITS:
620                    lo += sourceget()
621                if sourcematch(","):
622                    while source.next in DIGITS:
623                        hi += sourceget()
624                else:
625                    hi = lo
626                if not sourcematch("}"):
627                    subpatternappend((LITERAL, _ord(this)))
628                    source.seek(here)
629                    continue
630
631                if lo:
632                    min = int(lo)
633                    if min >= MAXREPEAT:
634                        raise OverflowError("the repetition number is too large")
635                if hi:
636                    max = int(hi)
637                    if max >= MAXREPEAT:
638                        raise OverflowError("the repetition number is too large")
639                    if max < min:
640                        raise source.error("min repeat greater than max repeat",
641                                           source.tell() - here)
642            else:
643                raise AssertionError("unsupported quantifier %r" % (char,))
644            # figure out which item to repeat
645            if subpattern:
646                item = subpattern[-1:]
647            else:
648                item = None
649            if not item or item[0][0] is AT:
650                raise source.error("nothing to repeat",
651                                   source.tell() - here + len(this))
652            if item[0][0] in _REPEATCODES:
653                raise source.error("multiple repeat",
654                                   source.tell() - here + len(this))
655            if item[0][0] is SUBPATTERN:
656                group, add_flags, del_flags, p = item[0][1]
657                if group is None and not add_flags and not del_flags:
658                    item = p
659            if sourcematch("?"):
660                subpattern[-1] = (MIN_REPEAT, (min, max, item))
661            else:
662                subpattern[-1] = (MAX_REPEAT, (min, max, item))
663
664        elif this == ".":
665            subpatternappend((ANY, None))
666
667        elif this == "(":
668            start = source.tell() - 1
669            group = True
670            name = None
671            add_flags = 0
672            del_flags = 0
673            if sourcematch("?"):
674                # options
675                char = sourceget()
676                if char is None:
677                    raise source.error("unexpected end of pattern")
678                if char == "P":
679                    # python extensions
680                    if sourcematch("<"):
681                        # named group: skip forward to end of name
682                        name = source.getuntil(">")
683                        if not name.isidentifier():
684                            msg = "bad character in group name %r" % name
685                            raise source.error(msg, len(name) + 1)
686                    elif sourcematch("="):
687                        # named backreference
688                        name = source.getuntil(")")
689                        if not name.isidentifier():
690                            msg = "bad character in group name %r" % name
691                            raise source.error(msg, len(name) + 1)
692                        gid = state.groupdict.get(name)
693                        if gid is None:
694                            msg = "unknown group name %r" % name
695                            raise source.error(msg, len(name) + 1)
696                        if not state.checkgroup(gid):
697                            raise source.error("cannot refer to an open group",
698                                               len(name) + 1)
699                        state.checklookbehindgroup(gid, source)
700                        subpatternappend((GROUPREF, gid))
701                        continue
702
703                    else:
704                        char = sourceget()
705                        if char is None:
706                            raise source.error("unexpected end of pattern")
707                        raise source.error("unknown extension ?P" + char,
708                                           len(char) + 2)
709                elif char == ":":
710                    # non-capturing group
711                    group = None
712                elif char == "#":
713                    # comment
714                    while True:
715                        if source.next is None:
716                            raise source.error("missing ), unterminated comment",
717                                               source.tell() - start)
718                        if sourceget() == ")":
719                            break
720                    continue
721
722                elif char in "=!<":
723                    # lookahead assertions
724                    dir = 1
725                    if char == "<":
726                        char = sourceget()
727                        if char is None:
728                            raise source.error("unexpected end of pattern")
729                        if char not in "=!":
730                            raise source.error("unknown extension ?<" + char,
731                                               len(char) + 2)
732                        dir = -1 # lookbehind
733                        lookbehindgroups = state.lookbehindgroups
734                        if lookbehindgroups is None:
735                            state.lookbehindgroups = state.groups
736                    p = _parse_sub(source, state, verbose, nested + 1)
737                    if dir < 0:
738                        if lookbehindgroups is None:
739                            state.lookbehindgroups = None
740                    if not sourcematch(")"):
741                        raise source.error("missing ), unterminated subpattern",
742                                           source.tell() - start)
743                    if char == "=":
744                        subpatternappend((ASSERT, (dir, p)))
745                    else:
746                        subpatternappend((ASSERT_NOT, (dir, p)))
747                    continue
748
749                elif char == "(":
750                    # conditional backreference group
751                    condname = source.getuntil(")")
752                    if condname.isidentifier():
753                        condgroup = state.groupdict.get(condname)
754                        if condgroup is None:
755                            msg = "unknown group name %r" % condname
756                            raise source.error(msg, len(condname) + 1)
757                    else:
758                        try:
759                            condgroup = int(condname)
760                            if condgroup < 0:
761                                raise ValueError
762                        except ValueError:
763                            msg = "bad character in group name %r" % condname
764                            raise source.error(msg, len(condname) + 1) from None
765                        if not condgroup:
766                            raise source.error("bad group number",
767                                               len(condname) + 1)
768                        if condgroup >= MAXGROUPS:
769                            msg = "invalid group reference %d" % condgroup
770                            raise source.error(msg, len(condname) + 1)
771                    state.checklookbehindgroup(condgroup, source)
772                    item_yes = _parse(source, state, verbose, nested + 1)
773                    if source.match("|"):
774                        item_no = _parse(source, state, verbose, nested + 1)
775                        if source.next == "|":
776                            raise source.error("conditional backref with more than two branches")
777                    else:
778                        item_no = None
779                    if not source.match(")"):
780                        raise source.error("missing ), unterminated subpattern",
781                                           source.tell() - start)
782                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
783                    continue
784
785                elif char in FLAGS or char == "-":
786                    # flags
787                    flags = _parse_flags(source, state, char)
788                    if flags is None:  # global flags
789                        if not first or subpattern:
790                            import warnings
791                            warnings.warn(
792                                'Flags not at the start of the expression %r%s' % (
793                                    source.string[:20],  # truncate long regexes
794                                    ' (truncated)' if len(source.string) > 20 else '',
795                                ),
796                                DeprecationWarning, stacklevel=nested + 6
797                            )
798                        if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
799                            raise Verbose
800                        continue
801
802                    add_flags, del_flags = flags
803                    group = None
804                else:
805                    raise source.error("unknown extension ?" + char,
806                                       len(char) + 1)
807
808            # parse group contents
809            if group is not None:
810                try:
811                    group = state.opengroup(name)
812                except error as err:
813                    raise source.error(err.msg, len(name) + 1) from None
814            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
815                           not (del_flags & SRE_FLAG_VERBOSE))
816            p = _parse_sub(source, state, sub_verbose, nested + 1)
817            if not source.match(")"):
818                raise source.error("missing ), unterminated subpattern",
819                                   source.tell() - start)
820            if group is not None:
821                state.closegroup(group, p)
822            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
823
824        elif this == "^":
825            subpatternappend((AT, AT_BEGINNING))
826
827        elif this == "$":
828            subpatternappend((AT, AT_END))
829
830        else:
831            raise AssertionError("unsupported special character %r" % (char,))
832
833    # unpack non-capturing groups
834    for i in range(len(subpattern))[::-1]:
835        op, av = subpattern[i]
836        if op is SUBPATTERN:
837            group, add_flags, del_flags, p = av
838            if group is None and not add_flags and not del_flags:
839                subpattern[i: i+1] = p
840
841    return subpattern
842
843def _parse_flags(source, state, char):
844    sourceget = source.get
845    add_flags = 0
846    del_flags = 0
847    if char != "-":
848        while True:
849            flag = FLAGS[char]
850            if source.istext:
851                if char == 'L':
852                    msg = "bad inline flags: cannot use 'L' flag with a str pattern"
853                    raise source.error(msg)
854            else:
855                if char == 'u':
856                    msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
857                    raise source.error(msg)
858            add_flags |= flag
859            if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
860                msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
861                raise source.error(msg)
862            char = sourceget()
863            if char is None:
864                raise source.error("missing -, : or )")
865            if char in ")-:":
866                break
867            if char not in FLAGS:
868                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
869                raise source.error(msg, len(char))
870    if char == ")":
871        state.flags |= add_flags
872        return None
873    if add_flags & GLOBAL_FLAGS:
874        raise source.error("bad inline flags: cannot turn on global flag", 1)
875    if char == "-":
876        char = sourceget()
877        if char is None:
878            raise source.error("missing flag")
879        if char not in FLAGS:
880            msg = "unknown flag" if char.isalpha() else "missing flag"
881            raise source.error(msg, len(char))
882        while True:
883            flag = FLAGS[char]
884            if flag & TYPE_FLAGS:
885                msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
886                raise source.error(msg)
887            del_flags |= flag
888            char = sourceget()
889            if char is None:
890                raise source.error("missing :")
891            if char == ":":
892                break
893            if char not in FLAGS:
894                msg = "unknown flag" if char.isalpha() else "missing :"
895                raise source.error(msg, len(char))
896    assert char == ":"
897    if del_flags & GLOBAL_FLAGS:
898        raise source.error("bad inline flags: cannot turn off global flag", 1)
899    if add_flags & del_flags:
900        raise source.error("bad inline flags: flag turned on and off", 1)
901    return add_flags, del_flags
902
903def fix_flags(src, flags):
904    # Check and fix flags according to the type of pattern (str or bytes)
905    if isinstance(src, str):
906        if flags & SRE_FLAG_LOCALE:
907            raise ValueError("cannot use LOCALE flag with a str pattern")
908        if not flags & SRE_FLAG_ASCII:
909            flags |= SRE_FLAG_UNICODE
910        elif flags & SRE_FLAG_UNICODE:
911            raise ValueError("ASCII and UNICODE flags are incompatible")
912    else:
913        if flags & SRE_FLAG_UNICODE:
914            raise ValueError("cannot use UNICODE flag with a bytes pattern")
915        if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
916            raise ValueError("ASCII and LOCALE flags are incompatible")
917    return flags
918
919def parse(str, flags=0, pattern=None):
920    # parse 're' pattern into list of (opcode, argument) tuples
921
922    source = Tokenizer(str)
923
924    if pattern is None:
925        pattern = Pattern()
926    pattern.flags = flags
927    pattern.str = str
928
929    try:
930        p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0)
931    except Verbose:
932        # the VERBOSE flag was switched on inside the pattern.  to be
933        # on the safe side, we'll parse the whole thing again...
934        pattern = Pattern()
935        pattern.flags = flags | SRE_FLAG_VERBOSE
936        pattern.str = str
937        source.seek(0)
938        p = _parse_sub(source, pattern, True, 0)
939
940    p.pattern.flags = fix_flags(str, p.pattern.flags)
941
942    if source.next is not None:
943        assert source.next == ")"
944        raise source.error("unbalanced parenthesis")
945
946    if flags & SRE_FLAG_DEBUG:
947        p.dump()
948
949    return p
950
951def parse_template(source, pattern):
952    # parse 're' replacement string into list of literals and
953    # group references
954    s = Tokenizer(source)
955    sget = s.get
956    groups = []
957    literals = []
958    literal = []
959    lappend = literal.append
960    def addgroup(index, pos):
961        if index > pattern.groups:
962            raise s.error("invalid group reference %d" % index, pos)
963        if literal:
964            literals.append(''.join(literal))
965            del literal[:]
966        groups.append((len(literals), index))
967        literals.append(None)
968    groupindex = pattern.groupindex
969    while True:
970        this = sget()
971        if this is None:
972            break # end of replacement string
973        if this[0] == "\\":
974            # group
975            c = this[1]
976            if c == "g":
977                name = ""
978                if not s.match("<"):
979                    raise s.error("missing <")
980                name = s.getuntil(">")
981                if name.isidentifier():
982                    try:
983                        index = groupindex[name]
984                    except KeyError:
985                        raise IndexError("unknown group name %r" % name)
986                else:
987                    try:
988                        index = int(name)
989                        if index < 0:
990                            raise ValueError
991                    except ValueError:
992                        raise s.error("bad character in group name %r" % name,
993                                      len(name) + 1) from None
994                    if index >= MAXGROUPS:
995                        raise s.error("invalid group reference %d" % index,
996                                      len(name) + 1)
997                addgroup(index, len(name) + 1)
998            elif c == "0":
999                if s.next in OCTDIGITS:
1000                    this += sget()
1001                    if s.next in OCTDIGITS:
1002                        this += sget()
1003                lappend(chr(int(this[1:], 8) & 0xff))
1004            elif c in DIGITS:
1005                isoctal = False
1006                if s.next in DIGITS:
1007                    this += sget()
1008                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
1009                        s.next in OCTDIGITS):
1010                        this += sget()
1011                        isoctal = True
1012                        c = int(this[1:], 8)
1013                        if c > 0o377:
1014                            raise s.error('octal escape value %s outside of '
1015                                          'range 0-0o377' % this, len(this))
1016                        lappend(chr(c))
1017                if not isoctal:
1018                    addgroup(int(this[1:]), len(this) - 1)
1019            else:
1020                try:
1021                    this = chr(ESCAPES[this][1])
1022                except KeyError:
1023                    if c in ASCIILETTERS:
1024                        raise s.error('bad escape %s' % this, len(this))
1025                lappend(this)
1026        else:
1027            lappend(this)
1028    if literal:
1029        literals.append(''.join(literal))
1030    if not isinstance(source, str):
1031        # The tokenizer implicitly decodes bytes objects as latin-1, we must
1032        # therefore re-encode the final representation.
1033        literals = [None if s is None else s.encode('latin-1') for s in literals]
1034    return groups, literals
1035
1036def expand_template(template, match):
1037    g = match.group
1038    empty = match.string[:0]
1039    groups, literals = template
1040    literals = literals[:]
1041    try:
1042        for index, group in groups:
1043            literals[index] = g(group) or empty
1044    except IndexError:
1045        raise error("invalid group reference %d" % index)
1046    return empty.join(literals)
1047