# sre_compile.py revision 6b826abc707eb101f8426382487de88cd4e7b046
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys

from sre_constants import *

# the C engine and this compiler must agree on the code format
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# largest value that fits in one engine code word (the _sre engine is
# built with either 16-bit or 32-bit code words)
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFF

def _identityfunction(x):
    """Return x unchanged (default literal "fixup" when no case folding)."""
    return x

def set(seq):
    """Build a dict-based pseudo-set mapping each element of seq to 1.

    Used only so that ``op in ...`` membership tests below are constant
    time; pre-dates use of a real set type in this module.
    """
    s = {}
    for elem in seq:
        s[elem] = 1
    return s

# opcode groups dispatched on in _compile
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])

def _compile(code, pattern, flags):
    """Compile a (sub)pattern to engine code words (internal).

    code    -- output list of code words, appended to in place
    pattern -- sequence of (opcode, argument) pairs from sre_parse
    flags   -- SRE_FLAG_* bits affecting code generation

    Raises error for unsupported constructs (template repeats,
    variable-width look-behind) and ValueError for unknown opcodes.

    Many branches use the idiom ``skip = _len(code); emit(0)`` to
    reserve a slot for a skip count that is patched in once the length
    of the enclosed code is known.
    """
    emit = code.append
    _len = len
    # bind module-level lookups to locals for speed in this hot loop
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                # fold the literal to lower case at compile time
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                # fixup lower-cases each member of the set when the
                # charset is compiled below
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE(review): everything below the raise in this
                # branch is unreachable (leftover template support)
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                # single-width body: use the cheaper REPEAT_ONE forms
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat: REPEAT ... MAX_UNTIL/MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av is (group number or None/0, subpattern)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            # remap the anchor for multiline/locale/unicode variants
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                # remember each JUMP operand slot; patched after the
                # whole branch is emitted
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # av is (group number, yes-pattern, no-pattern or None)
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)

def _compile_charset(charset, flags, code, fixup=None):
    """Compile a charset subprogram (internal).

    Optimizes the (op, av) items first, then emits each one followed
    by a terminating FAILURE code.  fixup is applied to literals and
    range endpoints (used for case folding); defaults to identity.
    """
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    """Optimize a character set (internal).

    Builds a 256-entry bit map of the set, then re-encodes it either
    as at most two LITERAL/RANGE items or as a CHARSET bitmap,
    whichever is smaller.  Returns the original charset unchanged when
    it cannot be compressed (categories present, or characters outside
    the 0-255 range, which are delegated to _optimize_unicode).
    """
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map: collect (start, length) runs of set bits
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset

def _mk_bitmap(bits):
    """Pack a sequence of 0/1 flags into a list of engine code words.

    Each output word holds as many bits as fit below MAXCODE; words
    are filled least-significant bit first.
    """
    data = []
    dataappend = data.append
    # NOTE(review): both branches are identical; the wide-word path
    # presumably once used long literals (1L, 0L) here -- harmless
    # either way under int/long unification, but worth confirming
    # against upstream.
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1, 0)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.

def _optimize_unicode(charset, fixup):
    """Compress a BMP charset into a single BIGCHARSET item (internal).

    Builds a 65536-entry bit map, deduplicates its 256-character
    chunks, and returns [(BIGCHARSET, data)] where data is the header
    (chunk count + 256-byte chunk index, re-read as code words)
    followed by the unique chunk bitmaps.  Returns the charset
    unchanged whenever it cannot be compressed: no array module,
    categories present, non-BMP characters, or negation on a wide
    (UCS-4) build.
    """
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in range(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        # comps maps each distinct chunk to the number it was first
        # assigned; setdefault both looks up and registers new chunks
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    assert len(mapping) * mapping.itemsize == 256
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    """Return true if av's subpattern can use the REPEAT_ONE fast path.

    The body must match exactly one character and must not be a
    SUBPATTERN (which would need MARK bookkeeping).  Raises error for
    a (0, MAXREPEAT) body, i.e. nothing to repeat.
    """
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                # prefix_skip tracks how many leading literals are
                # plain (not inside a group) and can be skipped over
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # single-item group: a literal inside still extends
                # the prefix, but not prefix_skip
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # usable only if every alternative starts with a
                    # literal; otherwise keep charset empty
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        # whole pattern is the literal prefix
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length (clamped to what fits in a code word)
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (failure function as in
        # Knuth-Morris-Pratt prefix matching)
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

def isstring(obj):
    """Return true if obj is a (byte or unicode) string."""
    return isinstance(obj, basestring)

def _code(p, flags):
    """Generate the full code-word list for parsed pattern p (internal).

    Combines the pattern's own flags with the caller's, emits the info
    block, the pattern body, and a final SUCCESS opcode.
    """

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        # accept a raw pattern string; keep it for the compiled
        # object, and parse it into a pattern list
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )