1# 2# Secret Labs' Regular Expression Engine 3# 4# convert template to internal format 5# 6# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13import _sre, sys 14import sre_parse 15from sre_constants import * 16 17assert _sre.MAGIC == MAGIC, "SRE module mismatch" 18 19if _sre.CODESIZE == 2: 20 MAXCODE = 65535 21else: 22 MAXCODE = 0xFFFFFFFFL 23 24def _identityfunction(x): 25 return x 26 27_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) 28_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) 29_SUCCESS_CODES = set([SUCCESS, FAILURE]) 30_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) 31 32def _compile(code, pattern, flags): 33 # internal: compile a (sub)pattern 34 emit = code.append 35 _len = len 36 LITERAL_CODES = _LITERAL_CODES 37 REPEATING_CODES = _REPEATING_CODES 38 SUCCESS_CODES = _SUCCESS_CODES 39 ASSERT_CODES = _ASSERT_CODES 40 for op, av in pattern: 41 if op in LITERAL_CODES: 42 if flags & SRE_FLAG_IGNORECASE: 43 emit(OPCODES[OP_IGNORE[op]]) 44 emit(_sre.getlower(av, flags)) 45 else: 46 emit(OPCODES[op]) 47 emit(av) 48 elif op is IN: 49 if flags & SRE_FLAG_IGNORECASE: 50 emit(OPCODES[OP_IGNORE[op]]) 51 def fixup(literal, flags=flags): 52 return _sre.getlower(literal, flags) 53 else: 54 emit(OPCODES[op]) 55 fixup = _identityfunction 56 skip = _len(code); emit(0) 57 _compile_charset(av, flags, code, fixup) 58 code[skip] = _len(code) - skip 59 elif op is ANY: 60 if flags & SRE_FLAG_DOTALL: 61 emit(OPCODES[ANY_ALL]) 62 else: 63 emit(OPCODES[ANY]) 64 elif op in REPEATING_CODES: 65 if flags & SRE_FLAG_TEMPLATE: 66 raise error, "internal: unsupported template operator" 67 emit(OPCODES[REPEAT]) 68 skip = _len(code); emit(0) 69 emit(av[0]) 70 emit(av[1]) 71 _compile(code, av[2], flags) 72 emit(OPCODES[SUCCESS]) 73 code[skip] = _len(code) - skip 74 elif _simple(av) and op is not REPEAT: 75 if op is MAX_REPEAT: 76 emit(OPCODES[REPEAT_ONE]) 77 else: 78 emit(OPCODES[MIN_REPEAT_ONE]) 79 skip = _len(code); emit(0) 80 emit(av[0]) 81 emit(av[1]) 82 _compile(code, av[2], flags) 83 emit(OPCODES[SUCCESS]) 84 code[skip] = _len(code) - skip 85 else: 86 emit(OPCODES[REPEAT]) 87 skip = _len(code); emit(0) 88 emit(av[0]) 89 emit(av[1]) 90 _compile(code, av[2], flags) 91 code[skip] = _len(code) - skip 92 if op is MAX_REPEAT: 93 emit(OPCODES[MAX_UNTIL]) 94 else: 95 emit(OPCODES[MIN_UNTIL]) 96 elif op is SUBPATTERN: 97 if av[0]: 98 emit(OPCODES[MARK]) 99 emit((av[0]-1)*2) 100 # _compile_info(code, av[1], flags) 101 _compile(code, av[1], flags) 102 if av[0]: 103 emit(OPCODES[MARK]) 104 emit((av[0]-1)*2+1) 105 elif op in SUCCESS_CODES: 106 emit(OPCODES[op]) 107 elif op in ASSERT_CODES: 108 emit(OPCODES[op]) 109 skip = _len(code); emit(0) 110 if av[0] >= 0: 111 emit(0) # look ahead 112 else: 113 lo, hi = av[1].getwidth() 114 if lo != hi: 115 raise error, "look-behind requires fixed-width pattern" 116 emit(lo) # look behind 117 _compile(code, av[1], flags) 118 emit(OPCODES[SUCCESS]) 119 code[skip] = _len(code) - skip 120 elif op is CALL: 121 emit(OPCODES[op]) 122 skip = _len(code); emit(0) 123 _compile(code, av, flags) 124 emit(OPCODES[SUCCESS]) 125 code[skip] = _len(code) - skip 126 elif op is AT: 127 emit(OPCODES[op]) 128 if flags & SRE_FLAG_MULTILINE: 129 av = AT_MULTILINE.get(av, av) 130 if flags & SRE_FLAG_LOCALE: 131 av = AT_LOCALE.get(av, av) 132 elif flags & SRE_FLAG_UNICODE: 133 av = AT_UNICODE.get(av, av) 134 emit(ATCODES[av]) 135 elif op is BRANCH: 136 emit(OPCODES[op]) 137 tail = [] 138 tailappend = tail.append 139 for av in av[1]: 140 skip = _len(code); emit(0) 141 # _compile_info(code, av, flags) 142 _compile(code, av, flags) 143 emit(OPCODES[JUMP]) 144 tailappend(_len(code)); emit(0) 145 code[skip] = _len(code) - skip 146 emit(0) # end of branch 147 for tail in tail: 148 code[tail] = _len(code) - tail 149 elif op is CATEGORY: 150 emit(OPCODES[op]) 151 if flags & SRE_FLAG_LOCALE: 152 av = CH_LOCALE[av] 153 elif flags & SRE_FLAG_UNICODE: 154 av = CH_UNICODE[av] 155 emit(CHCODES[av]) 156 elif op is GROUPREF: 157 if flags & SRE_FLAG_IGNORECASE: 158 emit(OPCODES[OP_IGNORE[op]]) 159 else: 160 emit(OPCODES[op]) 161 emit(av-1) 162 elif op is GROUPREF_EXISTS: 163 emit(OPCODES[op]) 164 emit(av[0]-1) 165 skipyes = _len(code); emit(0) 166 _compile(code, av[1], flags) 167 if av[2]: 168 emit(OPCODES[JUMP]) 169 skipno = _len(code); emit(0) 170 code[skipyes] = _len(code) - skipyes + 1 171 _compile(code, av[2], flags) 172 code[skipno] = _len(code) - skipno 173 else: 174 code[skipyes] = _len(code) - skipyes + 1 175 else: 176 raise ValueError, ("unsupported operand type", op) 177 178def _compile_charset(charset, flags, code, fixup=None): 179 # compile charset subprogram 180 emit = code.append 181 if fixup is None: 182 fixup = _identityfunction 183 for op, av in _optimize_charset(charset, fixup): 184 emit(OPCODES[op]) 185 if op is NEGATE: 186 pass 187 elif op is LITERAL: 188 emit(fixup(av)) 189 elif op is RANGE: 190 emit(fixup(av[0])) 191 emit(fixup(av[1])) 192 elif op is CHARSET: 193 code.extend(av) 194 elif op is BIGCHARSET: 195 code.extend(av) 196 elif op is CATEGORY: 197 if flags & SRE_FLAG_LOCALE: 198 emit(CHCODES[CH_LOCALE[av]]) 199 elif flags & SRE_FLAG_UNICODE: 200 emit(CHCODES[CH_UNICODE[av]]) 201 else: 202 emit(CHCODES[av]) 203 else: 204 raise error, "internal: unsupported set operator" 205 emit(OPCODES[FAILURE]) 206 207def _optimize_charset(charset, fixup): 208 # internal: optimize character set 209 out = [] 210 outappend = out.append 211 charmap = [0]*256 212 try: 213 for op, av in charset: 214 if op is NEGATE: 215 outappend((op, av)) 216 elif op is LITERAL: 217 charmap[fixup(av)] = 1 218 elif op is RANGE: 219 for i in range(fixup(av[0]), fixup(av[1])+1): 220 charmap[i] = 1 221 elif op is CATEGORY: 222 # XXX: could append to charmap tail 223 return charset # cannot compress 224 except IndexError: 225 # character set contains unicode characters 226 return _optimize_unicode(charset, fixup) 227 # compress character map 228 i = p = n = 0 229 runs = [] 230 runsappend = runs.append 231 for c in charmap: 232 if c: 233 if n == 0: 234 p = i 235 n = n + 1 236 elif n: 237 runsappend((p, n)) 238 n = 0 239 i = i + 1 240 if n: 241 runsappend((p, n)) 242 if len(runs) <= 2: 243 # use literal/range 244 for p, n in runs: 245 if n == 1: 246 outappend((LITERAL, p)) 247 else: 248 outappend((RANGE, (p, p+n-1))) 249 if len(out) < len(charset): 250 return out 251 else: 252 # use bitmap 253 data = _mk_bitmap(charmap) 254 outappend((CHARSET, data)) 255 return out 256 return charset 257 258def _mk_bitmap(bits): 259 data = [] 260 dataappend = data.append 261 if _sre.CODESIZE == 2: 262 start = (1, 0) 263 else: 264 start = (1L, 0L) 265 m, v = start 266 for c in bits: 267 if c: 268 v = v + m 269 m = m + m 270 if m > MAXCODE: 271 dataappend(v) 272 m, v = start 273 return data 274 275# To represent a big charset, first a bitmap of all characters in the 276# set is constructed. Then, this bitmap is sliced into chunks of 256 277# characters, duplicate chunks are eliminated, and each chunk is 278# given a number. In the compiled expression, the charset is 279# represented by a 16-bit word sequence, consisting of one word for 280# the number of different chunks, a sequence of 256 bytes (128 words) 281# of chunk numbers indexed by their original chunk position, and a 282# sequence of chunks (16 words each). 283 284# Compression is normally good: in a typical charset, large ranges of 285# Unicode will be either completely excluded (e.g. if only cyrillic 286# letters are to be matched), or completely included (e.g. if large 287# subranges of Kanji match). These ranges will be represented by 288# chunks of all one-bits or all zero-bits. 289 290# Matching can be also done efficiently: the more significant byte of 291# the Unicode character is an index into the chunk number, and the 292# less significant byte is a bit index in the chunk (just like the 293# CHARSET matching). 294 295# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 296# of the basic multilingual plane; an efficient representation 297# for all of UTF-16 has not yet been developed. This means, 298# in particular, that negated charsets cannot be represented as 299# bigcharsets. 300 301def _optimize_unicode(charset, fixup): 302 try: 303 import array 304 except ImportError: 305 return charset 306 charmap = [0]*65536 307 negate = 0 308 try: 309 for op, av in charset: 310 if op is NEGATE: 311 negate = 1 312 elif op is LITERAL: 313 charmap[fixup(av)] = 1 314 elif op is RANGE: 315 for i in xrange(fixup(av[0]), fixup(av[1])+1): 316 charmap[i] = 1 317 elif op is CATEGORY: 318 # XXX: could expand category 319 return charset # cannot compress 320 except IndexError: 321 # non-BMP characters 322 return charset 323 if negate: 324 if sys.maxunicode != 65535: 325 # XXX: negation does not work with big charsets 326 return charset 327 for i in xrange(65536): 328 charmap[i] = not charmap[i] 329 comps = {} 330 mapping = [0]*256 331 block = 0 332 data = [] 333 for i in xrange(256): 334 chunk = tuple(charmap[i*256:(i+1)*256]) 335 new = comps.setdefault(chunk, block) 336 mapping[i] = new 337 if new == block: 338 block = block + 1 339 data = data + _mk_bitmap(chunk) 340 header = [block] 341 if _sre.CODESIZE == 2: 342 code = 'H' 343 else: 344 code = 'I' 345 # Convert block indices to byte array of 256 bytes 346 mapping = array.array('b', mapping).tostring() 347 # Convert byte array to word array 348 mapping = array.array(code, mapping) 349 assert mapping.itemsize == _sre.CODESIZE 350 header = header + mapping.tolist() 351 data[0:0] = header 352 return [(BIGCHARSET, data)] 353 354def _simple(av): 355 # check if av is a "simple" operator 356 lo, hi = av[2].getwidth() 357 if lo == 0 and hi == MAXREPEAT: 358 raise error, "nothing to repeat" 359 return lo == hi == 1 and av[2][0][0] != SUBPATTERN 360 361def _compile_info(code, pattern, flags): 362 # internal: compile an info block. in the current version, 363 # this contains min/max pattern width, and an optional literal 364 # prefix or a character map 365 lo, hi = pattern.getwidth() 366 if lo == 0: 367 return # not worth it 368 # look for a literal prefix 369 prefix = [] 370 prefixappend = prefix.append 371 prefix_skip = 0 372 charset = [] # not used 373 charsetappend = charset.append 374 if not (flags & SRE_FLAG_IGNORECASE): 375 # look for literal prefix 376 for op, av in pattern.data: 377 if op is LITERAL: 378 if len(prefix) == prefix_skip: 379 prefix_skip = prefix_skip + 1 380 prefixappend(av) 381 elif op is SUBPATTERN and len(av[1]) == 1: 382 op, av = av[1][0] 383 if op is LITERAL: 384 prefixappend(av) 385 else: 386 break 387 else: 388 break 389 # if no prefix, look for charset prefix 390 if not prefix and pattern.data: 391 op, av = pattern.data[0] 392 if op is SUBPATTERN and av[1]: 393 op, av = av[1][0] 394 if op is LITERAL: 395 charsetappend((op, av)) 396 elif op is BRANCH: 397 c = [] 398 cappend = c.append 399 for p in av[1]: 400 if not p: 401 break 402 op, av = p[0] 403 if op is LITERAL: 404 cappend((op, av)) 405 else: 406 break 407 else: 408 charset = c 409 elif op is BRANCH: 410 c = [] 411 cappend = c.append 412 for p in av[1]: 413 if not p: 414 break 415 op, av = p[0] 416 if op is LITERAL: 417 cappend((op, av)) 418 else: 419 break 420 else: 421 charset = c 422 elif op is IN: 423 charset = av 424## if prefix: 425## print "*** PREFIX", prefix, prefix_skip 426## if charset: 427## print "*** CHARSET", charset 428 # add an info block 429 emit = code.append 430 emit(OPCODES[INFO]) 431 skip = len(code); emit(0) 432 # literal flag 433 mask = 0 434 if prefix: 435 mask = SRE_INFO_PREFIX 436 if len(prefix) == prefix_skip == len(pattern.data): 437 mask = mask + SRE_INFO_LITERAL 438 elif charset: 439 mask = mask + SRE_INFO_CHARSET 440 emit(mask) 441 # pattern length 442 if lo < MAXCODE: 443 emit(lo) 444 else: 445 emit(MAXCODE) 446 prefix = prefix[:MAXCODE] 447 if hi < MAXCODE: 448 emit(hi) 449 else: 450 emit(0) 451 # add literal prefix 452 if prefix: 453 emit(len(prefix)) # length 454 emit(prefix_skip) # skip 455 code.extend(prefix) 456 # generate overlap table 457 table = [-1] + ([0]*len(prefix)) 458 for i in xrange(len(prefix)): 459 table[i+1] = table[i]+1 460 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 461 table[i+1] = table[table[i+1]-1]+1 462 code.extend(table[1:]) # don't store first entry 463 elif charset: 464 _compile_charset(charset, flags, code) 465 code[skip] = len(code) - skip 466 467try: 468 unicode 469except NameError: 470 STRING_TYPES = (type(""),) 471else: 472 STRING_TYPES = (type(""), type(unicode(""))) 473 474def isstring(obj): 475 for tp in STRING_TYPES: 476 if isinstance(obj, tp): 477 return 1 478 return 0 479 480def _code(p, flags): 481 482 flags = p.pattern.flags | flags 483 code = [] 484 485 # compile info block 486 _compile_info(code, p, flags) 487 488 # compile the pattern 489 _compile(code, p.data, flags) 490 491 code.append(OPCODES[SUCCESS]) 492 493 return code 494 495def compile(p, flags=0): 496 # internal: convert pattern list to internal format 497 498 if isstring(p): 499 pattern = p 500 p = sre_parse.parse(p, flags) 501 else: 502 pattern = None 503 504 code = _code(p, flags) 505 506 # print code 507 508 # XXX: <fl> get rid of this limitation! 509 if p.pattern.groups > 100: 510 raise AssertionError( 511 "sorry, but this version only supports 100 named groups" 512 ) 513 514 # map in either direction 515 groupindex = p.pattern.groupdict 516 indexgroup = [None] * p.pattern.groups 517 for k, i in groupindex.items(): 518 indexgroup[i] = k 519 520 return _sre.compile( 521 pattern, flags | p.pattern.flags, code, 522 p.pattern.groups-1, 523 groupindex, indexgroup 524 ) 525