sre_compile.py revision 29e383754e4a96b46d1bd9f72a49694cdb993850
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys

from sre_constants import *

# fail early if the C engine was built against a different opcode set
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# largest value that fits in one code word of the engine (the _sre
# extension is built with either 2-byte or 4-byte code words)
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

def _identityfunction(x):
    # no-op "fixup"; stands in for case folding when IGNORECASE is off
    return x

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Walks the parsed (op, av) nodes in "pattern" and appends engine
    # opcodes to the "code" list.  Variable-length constructs reserve a
    # placeholder word with emit(0) and backpatch it afterwards with a
    # relative skip count: code[skip] = _len(code) - skip.
    emit = code.append
    _len = len    # local alias; len() is called on every backpatch
    # membership dictionaries (only the keys matter, values are dummies)
    LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1}
    REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1}
    SUCCESS_CODES = {SUCCESS:1, FAILURE:1}
    ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1}
    for op, av in pattern:
        if op in LITERAL_CODES:
            # single (possibly negated) character
            if flags & SRE_FLAG_IGNORECASE:
                # emit the *_IGNORE variant with a pre-folded literal
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            # character class; "fixup" lowercases each member when
            # matching case-insensitively
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])  # "." also matches newline
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            # av is (min, max, subpattern)
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE(review): the emits below are unreachable after
                # the raise; kept byte-identical to the original
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                # body has fixed width 1: use the cheaper *_ONE form
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat: REPEAT body MAX_UNTIL/MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av is (group number or None, subpattern); MARK opcodes
            # record group boundaries: 2*(g-1) opens, 2*(g-1)+1 closes
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            # lookahead/lookbehind; av is (direction, subpattern)
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                # lookbehind needs a fixed width to know how far back
                # to rewind before matching the assertion body
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            # position anchor (^, $, \b, ...); pick the flag-dependent
            # variant of the anchor code before emitting it
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            # alternation: each branch is prefixed with a skip word and
            # ends with a JUMP to the common tail, backpatched below
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            # patch every branch's JUMP target to the common tail
            # (note: the loop variable deliberately reuses the name)
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            # back-reference; av is the 1-based group number
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # conditional pattern (?(group)yes|no); av is
            # (group, yes-pattern, no-pattern-or-None)
            emit(OPCODES[op])
            emit((av[0]-1)*2)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)

def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram: emit the (optimized) set items,
    # terminated by a FAILURE sentinel
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass  # NEGATE carries no argument
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)       # av is a pre-built bitmap word list
        elif op is BIGCHARSET:
            code.extend(av)       # av is a pre-built header+chunk list
        elif op is CATEGORY:
            # pick the locale/unicode variant of the category code
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    #
    # Builds a 256-entry boolean map of the set members, then returns
    # either a short literal/range list or a CHARSET bitmap, whichever
    # encodes more compactly.  Returns the original set unchanged (or
    # hands off to the unicode optimizer) when it cannot do better.
    out = []
    outappend = out.append
    charmap = [False]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = True
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = True
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters (ordinal > 255)
        return _optimize_unicode(charset, fixup)
    # compress character map: collect runs of consecutive set bits
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i    # a new run starts here
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset

def _mk_bitmap(bits):
    # pack a sequence of booleans into engine code words, one bit per
    # input item, least-significant bit first
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)    # long arithmetic for 32-bit code words
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            # current word is full; flush it and start the next one
            dataappend(v)
            m, v = start
    return data

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.

def _optimize_unicode(charset, fixup):
    # internal: build a BIGCHARSET item for a set containing ordinals
    # above 255 (see the block comment above for the data layout)
    try:
        import array
    except ImportError:
        return charset
    charmap = [False]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = True
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = True
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    # slice the 64K bitmap into 256-char chunks and deduplicate them
    comps = {}           # chunk tuple -> block number of first occurrence
    mapping = [0]*256    # chunk position -> block number
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: store its bitmap
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if MAXCODE == 65535:
        code = 'H'
    else:
        code = 'L'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    header = header + array.array(code, mapping).tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator: a repeat body of fixed
    # width 1 that is not a capturing subpattern
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0      # length of the prefix that lies outside any group
    charset = []         # optional charset prefix, filled in below
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                # prefix_skip only grows while no group literal has
                # been appended yet
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # all alternatives must start with a literal for
                    # the branch to yield a charset prefix
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                # same check for a top-level alternation
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            # the whole pattern is one literal string
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)    # 0 means "max width unknown/too large"
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (KMP-style failure function: how far
        # the prefix overlaps itself at each position)
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

# determine which string types exist in this interpreter build
# (pre-unicode pythons only have 8-bit strings)
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    # return 1 if obj is a string (8-bit or unicode), else 0
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):
    # internal: compile parsed pattern "p" to a flat opcode list

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format
    # "p" may be a pattern string (parsed here via sre_parse) or an
    # already-parsed pattern object

    if isstring(p):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100,\
           "sorry, but this version only supports 100 named groups"

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )