# sre_compile.py revision 5e376a7809d7efe014f3f9b967a4554d79e10328
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys
import sre_parse
from sre_constants import *
from _sre import MAXREPEAT

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Largest value that fits in one code word of the engine
# (engine words are 16 or 32 bits wide, per _sre.CODESIZE).
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFF

def _identityfunction(x):
    # No-op "fixup": used where case folding would otherwise apply.
    return x

# Opcode groups that share a handler in _compile below.
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Appends engine opcodes for each (op, av) node of the parsed
    # `pattern` onto the `code` list.  `skip` slots are emitted as 0
    # and back-patched once the length of the emitted span is known.
    emit = code.append
    _len = len
    # bind module-level tables to locals for the hot loop
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                # fold every set member to lower case at compile time
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator")
                # NOTE: everything below this raise is unreachable
                # (kept as in the original source).
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                # single-width body: use the faster REPEAT_ONE forms
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat: REPEAT ... MAX_UNTIL / MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (0 for non-capturing groups);
            # MARK opcodes bracket capturing groups.
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            # anchors; remap per MULTILINE / LOCALE / UNICODE flags
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            # back-patch each alternative's JUMP to point past the branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # conditional: (?(group)yes|no)
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError("unsupported operand type", op)

def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    #
    # Emits the (possibly optimized) set items followed by FAILURE
    # as the set terminator.  `fixup` maps literals, e.g. for
    # case-insensitive matching.
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    #
    # Builds a 256-entry bitmap of the set; an IndexError means a
    # character >= 256 was seen and the set is handed off to
    # _optimize_unicode.  Otherwise the bitmap is compressed to
    # literal/range items or a CHARSET bitmap, whichever is smaller.
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map: collect runs of consecutive set bits
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset

def _mk_bitmap(bits):
    # Pack a sequence of 0/1 flags into code words (little-endian
    # bit order within each word), MAXCODE bits... one word per
    # CODESIZE*8 bits.
    data = []
    dataappend = data.append
    # NOTE: both branches are identical; this is a leftover from the
    # Python 2 version where the wide case used longs (1L, 0L).
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1, 0)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.

def _optimize_unicode(charset, fixup):
    """Build a BIGCHARSET representation for a BMP charset.

    Returns a single-item charset [(BIGCHARSET, data)], or the
    original charset unchanged when it cannot be compressed
    (categories present, non-BMP characters, or negation in wide
    builds).
    """
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters; XXX now they should work
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            # XXX2: now they should work, but removing this will make the
            # charmap 17 times bigger
            return charset
        for i in range(65536):
            charmap[i] = not charmap[i]
    # deduplicate 256-char chunks; `mapping` maps chunk position to
    # the index of its (possibly shared) bitmap block in `data`
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes.
    # BUGFIX: chunk indices range 0..255, so the array must use the
    # *unsigned* byte type code 'B'; the signed 'b' used previously
    # raises OverflowError for any index > 127 (i.e. for charsets
    # with more than 128 distinct chunks).
    mapping = array.array('B', mapping).tobytes()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    assert len(mapping) * mapping.itemsize == 256
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    """Return True if the repeat body av[2] is a fixed-width-1,
    non-capturing item (eligible for REPEAT_ONE opcodes)."""
    lo, hi = av[2].getwidth()
    #if lo == 0 and hi == MAXREPEAT:
    #    raise error("nothing to repeat")
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # single-item group: look through it
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # all alternatives must start with a literal
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length (clamped to what fits in one code word)
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)  # 0 means "unbounded / unknown"
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (KMP-style failure function)
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

def isstring(obj):
    """Return True if obj is a str or bytes instance."""
    return isinstance(obj, (str, bytes))

def _code(p, flags):
    """Compile parsed pattern `p` into a flat code-word list:
    info block + pattern body + SUCCESS."""
    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format
    #
    # `p` may be a pattern string (parsed here) or an already-parsed
    # sre_parse pattern object.  Returns the compiled _sre pattern.
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )