#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""
12
import _sre
import sre_parse
from sre_constants import *
16
# Fail at import time if the MAGIC value exported by the _sre module
# differs from the one defined in sre_constants.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode families tested with "op in ..." by the compiler functions below.
# They are shared, read-only lookup tables, hence frozensets.
_LITERAL_CODES = frozenset({LITERAL, NOT_LITERAL})
_REPEATING_CODES = frozenset({REPEAT, MIN_REPEAT, MAX_REPEAT})
_SUCCESS_CODES = frozenset({SUCCESS, FAILURE})
_ASSERT_CODES = frozenset({ASSERT, ASSERT_NOT})
# Opcodes accepted by _simple() as the body of a "simple" repeat.
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
24
# Sets of lowercase characters which have the same uppercase.
# The trailing comment on each entry shows the characters themselves, in
# the same order as the code points in the tuple.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ﬅﬆ
)

# Maps each lowercase code in _equivalences to a tuple of the *other*
# lowercase codes of its group, i.e. those sharing the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
64
def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
    """Return *flags* with a group's flag additions and removals applied.

    flags      -- flag bits currently in effect
    add_flags  -- bits to switch on
    del_flags  -- bits to switch off (applied last, so they win)
    TYPE_FLAGS -- mask taken from sre_parse.TYPE_FLAGS; it is bound as a
                  default argument so the attribute is looked up only once

    If add_flags contains any bit of TYPE_FLAGS, every TYPE_FLAGS bit is
    first cleared from flags, so the result only carries the type bits
    that add_flags asks for.
    """
    if add_flags & TYPE_FLAGS:
        flags &= ~TYPE_FLAGS
    return (flags | add_flags) & ~del_flags
70
def _compile(code, pattern, flags):
    """Append the opcodes for a parsed (sub)pattern to *code*.

    code    -- list of code words; it is extended in place
    pattern -- iterable of (op, av) pairs describing the (sub)pattern
    flags   -- SRE_FLAG_* bits in effect for this (sub)pattern

    Nested constructs are compiled by recursive calls.  Wherever an opcode
    is followed by a skip word, a 0 placeholder is emitted first and is
    patched once the length of the compiled body is known.

    Raises error for a repeat when SRE_FLAG_TEMPLATE is set, for a
    look-behind whose minimum and maximum widths differ, and for an
    operator that none of the branches below handles.
    """
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # The case helpers stay None unless IGNORECASE is set without LOCALE.
    # fixes additionally stays None when the ASCII helpers are selected,
    # which is why "not fixes" is used below to mean "ascii".
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _ignorecase_fixes
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                # not a cased character: the plain opcode is emitted
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes:  # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    # lo has extra equivalents in _ignorecase_fixes:
                    # emit a set holding lo and all of them
                    emit(IN_UNI_IGNORE)
                    skip = _len(code)
                    emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
            elif not hascased:
                emit(IN)
            elif not fixes:  # ascii
                emit(IN_IGNORE)
            else:
                emit(IN_UNI_IGNORE)
            skip = _len(code)
            emit(0)
            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            # av is (min, max, body)
            if _simple(av[2]):
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                emit(REPEAT)
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                # the UNTIL opcode is emitted after the skip is patched,
                # so it is not counted in the skip
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group:
                emit(MARK)
                emit((group-1)*2)
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code)
            emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code)
            emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            # positions of the JUMP operands, patched after the last branch
            jumps = []
            for alternative in av[1]:
                skip = _len(code)
                emit(0)
                _compile(code, alternative, flags)
                emit(JUMP)
                jumps.append(_len(code))
                emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            for pos in jumps:
                code[pos] = _len(code) - pos
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes:  # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # av is (group, yes-pattern, no-pattern)
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code)
            emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code)
                emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))
248
def _compile_charset(charset, flags, code):
    """Append the compiled form of a character set to *code*.

    charset -- iterable of (op, av) set items
    flags   -- SRE_FLAG_* bits; only used to choose the CATEGORY variant
    code    -- list of code words; it is extended in place

    Each item's opcode is emitted, followed by its operands.  The sequence
    is terminated with FAILURE.  Raises error for an unknown set operator.
    """
    # compile charset subprogram
    emit = code.append
    for op, av in charset:
        emit(op)
        if op is NEGATE:
            pass  # no operand
        elif op is LITERAL:
            emit(av)
        elif op is RANGE or op is RANGE_UNI_IGNORE:
            emit(av[0])
            emit(av[1])
        elif op is CHARSET or op is BIGCHARSET:
            # av is already a sequence of code words
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CH_LOCALE[av])
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                emit(CH_UNICODE[av])
            else:
                emit(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    emit(FAILURE)
275
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    """Return a more compact, equivalent form of a character set.

    charset -- list of (op, av) set items
    iscased -- optional predicate on a character code; consulted only
               when fixup is given
    fixup   -- optional function applied to every character code before
               it is recorded (the caller in this file passes a
               lowercasing function)
    fixes   -- optional dict mapping a fixed-up code to further codes
               that are recorded together with it

    Returns (items, hascased).  hascased can only become true when fixup
    is given: either iscased() reported true for a member, or an item did
    not fit into the 65536-entry map.

    LITERAL and RANGE items are collected in a byte map.  The map is then
    re-expressed as at most two literals/ranges, as one CHARSET bitmap
    (codes below 256) or as one BIGCHARSET.  NEGATE items are kept in
    front, all other items are appended unchanged after the result.
    """
    # internal: optimize character set
    out = []
    tail = []
    # one byte per character code, 1 = member; enlarged on demand
    charmap = bytearray(256)
    hascased = False
    for op, av in charset:
        # The body runs a second time for the same item after charmap
        # has been enlarged (see the IndexError handler).
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                if fixup:
                    hascased = True
                    # There are only two ranges of cased non-BMP characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                    # and for both ranges RANGE_UNI_IGNORE works.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                tail.append((op, av))
            break

    # compress character map
    # runs collects up to two (start, stop) spans of consecutive members;
    # it is set to None as soon as a third span is found.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # prepend the chunk count and the chunk-number table
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased
407
# Number of bits in one code word, and the largest value a word can hold.
_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
# translate() table: byte 0 becomes b'0', every other byte becomes b'1'.
_BITS_TRANS = b'0' + b'1' * 255


def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    """Pack a bytes-like map of flags into a list of code words.

    bits -- bytes or bytearray with one byte per position; a zero byte
            is a cleared bit, any other byte a set bit

    Returns a list of integers of _CODEBITS bits each.  Position k of
    *bits* ends up as bit (k % _CODEBITS) of word (k // _CODEBITS).
    _CODEBITS and _int are default-argument bindings of module values and
    are not meant to be passed by callers.
    """
    # Turn the map into ASCII '0'/'1' digits and reverse it so that, in
    # every slice handed to int(), the lowest position is the last digit.
    s = bits.translate(_BITS_TRANS)[::-1]
    return [_int(s[i - _CODEBITS: i], 2)
            for i in range(len(s), 0, -_CODEBITS)]
415
def _bytes_to_codes(b):
    """Reinterpret the byte buffer *b* as a list of code words.

    b -- bytes-like object whose length is a multiple of the word size

    The buffer is viewed as native unsigned ints (memoryview format 'I');
    the assertions check that this format has the size of an SRE code
    word and that no bytes are left over.
    """
    # Convert block indices to word array
    a = memoryview(b).cast('I')
    assert a.itemsize == _sre.CODESIZE
    assert len(a) * a.itemsize == len(b)
    return a.tolist()
422
def _simple(p):
    """Return True if subpattern *p* is a single "simple" operator.

    p -- sequence of (op, av) pairs

    A subpattern qualifies when it holds exactly one item and that item's
    opcode is in _UNIT_CODES.  A SUBPATTERN item qualifies when its first
    field (the group) is None and its last field (the nested pattern)
    is itself simple.
    """
    # check if this subpattern is a "simple" operator
    if len(p) != 1:
        return False
    op, av = p[0]
    if op is SUBPATTERN:
        return av[0] is None and _simple(av[-1])
    return op in _UNIT_CODES
431
def _generate_overlap_table(prefix):
    """
    Generate an overlap table for the given prefix.

    prefix -- indexable sequence of comparable items

    An overlap table is a list of the same length as the prefix which
    describes the self-overlap ending at each index of the prefix:
    - if overlap[i] == 0, no non-empty proper suffix of prefix[0:i+1]
      is also a start of the prefix
    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] equals
      prefix[0:k], and k is the largest such value

    An empty prefix yields an empty table.
    """
    table = [0] * len(prefix)
    for i in range(1, len(prefix)):
        # start from the overlap that ended at the previous index and
        # fall back to shorter overlaps until one can be extended
        idx = table[i - 1]
        while prefix[i] != prefix[idx]:
            if idx == 0:
                table[i] = 0
                break
            idx = table[idx - 1]
        else:
            # loop ended without break: prefix[i] extends the overlap
            table[i] = idx + 1
    return table
452
def _get_iscased(flags):
    """Return the "is this character cased" predicate for *flags*.

    Returns None when SRE_FLAG_IGNORECASE is not set, the Unicode
    predicate of _sre when SRE_FLAG_UNICODE is set without SRE_FLAG_ASCII,
    and the ASCII predicate otherwise.  SRE_FLAG_LOCALE is not examined
    here.
    """
    if not flags & SRE_FLAG_IGNORECASE:
        return None
    elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
        return _sre.unicode_iscased
    else:
        return _sre.ascii_iscased
460
def _get_literal_prefix(pattern, flags):
    """Collect the run of literal characters at the start of *pattern*.

    pattern -- parsed pattern object; its .data sequence of (op, av)
               pairs is scanned
    flags   -- SRE_FLAG_* bits in effect

    Returns (prefix, prefix_skip, got_all):
      prefix      -- list of the literal character codes found
      prefix_skip -- length of the prefix collected before the first
                     SUBPATTERN whose group is not None (looking inside
                     nested subpatterns too), or None if there is none
      got_all     -- True if every item of the pattern was consumed

    The scan stops at the first item that is neither a LITERAL nor a
    SUBPATTERN, at a literal that is cased under case-insensitive
    matching, and at a subpattern whose effective flags combine
    IGNORECASE with LOCALE.
    """
    # look for literal prefix
    prefix = []
    prefix_skip = None
    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
            if iscased and iscased(av):
                break
            prefix.append(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            flags1 = _combine_flags(flags, add_flags, del_flags)
            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
                break
            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
            # only the first skip position found is kept
            if prefix_skip is None:
                if group is not None:
                    prefix_skip = len(prefix)
                elif prefix_skip1 is not None:
                    prefix_skip = len(prefix) + prefix_skip1
            prefix.extend(prefix1)
            if not got_all:
                break
        else:
            break
    else:
        # the loop was not interrupted: the whole pattern is literal
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False
491
def _get_charset_prefix(pattern, flags):
    """Return a character set that the first character must belong to.

    pattern -- parsed pattern object; only the first item of its .data
               is examined, after descending into leading SUBPATTERNs
    flags   -- SRE_FLAG_* bits in effect

    Returns a list of (op, av) set items, or None when no usable set is
    found: the pattern is empty, IGNORECASE is combined with LOCALE
    inside a leading subpattern, the first item is not a LITERAL, BRANCH
    or IN, or (under case-insensitive matching) the set would contain a
    cased character.
    """
    # unwrap leading subpatterns, tracking their flag changes
    while True:
        if not pattern.data:
            return None
        op, av = pattern.data[0]
        if op is not SUBPATTERN:
            break
        group, add_flags, del_flags, pattern = av
        flags = _combine_flags(flags, add_flags, del_flags)
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            return None

    iscased = _get_iscased(flags)
    if op is LITERAL:
        if iscased and iscased(av):
            return None
        return [(op, av)]
    elif op is BRANCH:
        # every alternative must start with a usable literal
        charset = []
        for p in av[1]:
            if not p:
                return None
            first_op, first_av = p[0]
            if first_op is LITERAL and not (iscased and iscased(first_av)):
                charset.append((first_op, first_av))
            else:
                return None
        return charset
    elif op is IN:
        charset = av
        if iscased:
            for item_op, item_av in charset:
                if item_op is LITERAL:
                    if iscased(item_av):
                        return None
                elif item_op is RANGE:
                    if item_av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(item_av[0], item_av[1]+1))):
                        return None
        return charset
    return None
535
def _compile_info(code, pattern, flags):
    """Append an INFO block describing *pattern* to *code*.

    code    -- list of code words; it is extended in place
    pattern -- parsed pattern object providing getwidth() and .data
    flags   -- SRE_FLAG_* bits in effect

    The block holds, in order: INFO, a skip word, a mask of SRE_INFO_*
    bits, the minimum and maximum pattern width (capped at MAXCODE) and
    then either a literal prefix with its overlap table or a compiled
    character set for the first character, if one was found.
    """
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        # minimum width 0: emit a fixed block (skip 4, mask 0) and do
        # not look for a prefix
        code.extend([INFO, 4, 0, lo, hi])
        return
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    # got_all is only read when a prefix was found; it is initialised
    # here so that it is bound on every path
    got_all = False
    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        # look for literal prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern, flags)
    # add an info block
    emit = code.append
    emit(INFO)
    skip = len(code)
    emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    emit(min(hi, MAXCODE))
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        charset, hascased = _optimize_charset(charset)
        assert not hascased
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
594
def isstring(obj):
    """Return True if *obj* is a str or bytes instance, else False."""
    return isinstance(obj, (str, bytes))
597
def _code(p, flags):
    """Compile parsed pattern *p* into a complete list of code words.

    p     -- parsed pattern object; p.pattern.flags is merged into flags
             and p.data is the item sequence that gets compiled
    flags -- SRE_FLAG_* bits requested by the caller

    The result is the INFO block, followed by the compiled pattern,
    followed by a final SUCCESS.
    """
    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(SUCCESS)

    return code
612
def _hex_code(code):
    """Format a sequence of code words as '[0x..., 0x...]'.

    Every word is zero-padded to the full width of a code word, i.e. two
    hex digits per byte of _sre.CODESIZE plus the '0x' prefix.
    """
    return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
615
def dis(code):
    """Print a readable listing of the compiled code words to stdout.

    code -- sequence of code words

    Each line starts with the offset of the opcode, followed by ':' if
    an earlier line printed that offset as a jump target and '.'
    otherwise.  Nested bodies are indented.  Raises ValueError for an
    opcode that is not handled here.
    """
    import sys

    labels = set()
    level = 0
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end] one nesting level deeper.
        def print_(*args, to=None):
            # Print one line for the opcode at offset "start"; a given
            # "to" is remembered as a label and shown as "(to N)".
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line without an offset column.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                arg = code[i]
                i += 1
                # rebuild the 256-byte chunk-number table from its words
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                skip = code[i]
                print_(op, skip, to=i+skip)
                # a skip of 0 marks the end of the alternatives
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
                skip, min_count, max_count = code[i: i+3]
                if max_count == MAXREPEAT:
                    max_count = 'MAXREPEAT'
                print_(op, skip, min_count, max_count, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min_width, max_width = code[i: i+4]
                if max_width == MAXREPEAT:
                    max_width = 'MAXREPEAT'
                print_(op, skip, bin(flags), min_width, max_width, to=i+skip)
                # from here on "start" tracks the position inside the
                # INFO block; it is reset at the top of the loop
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2('  prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2('  prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2('  overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
757
758
def compile(p, flags=0):
    """Convert a pattern to the compiled object returned by _sre.compile.

    p     -- a str or bytes pattern, which is parsed with sre_parse.parse
             first, or an already parsed pattern object
    flags -- SRE_FLAG_* bits; when SRE_FLAG_DEBUG is among them the
             generated code is printed with dis()

    For an already parsed pattern the pattern-source argument passed to
    _sre.compile is None.
    """
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # map in either direction
    # groupindex maps a group name to its number; indexgroup is the
    # reverse table, with None for groups that have no name
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, tuple(indexgroup)
        )
785