sre_compile.py revision 83e802796c80f46be616b48020356f7f51be533d
1# 2# Secret Labs' Regular Expression Engine 3# 4# convert template to internal format 5# 6# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13import _sre 14import sre_parse 15from sre_constants import * 16 17assert _sre.MAGIC == MAGIC, "SRE module mismatch" 18 19_LITERAL_CODES = {LITERAL, NOT_LITERAL} 20_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} 21_SUCCESS_CODES = {SUCCESS, FAILURE} 22_ASSERT_CODES = {ASSERT, ASSERT_NOT} 23 24# Sets of lowercase characters which have the same uppercase. 25_equivalences = ( 26 # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I 27 (0x69, 0x131), # iı 28 # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S 29 (0x73, 0x17f), # sſ 30 # MICRO SIGN, GREEK SMALL LETTER MU 31 (0xb5, 0x3bc), # µμ 32 # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI 33 (0x345, 0x3b9, 0x1fbe), # \u0345ιι 34 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 35 (0x390, 0x1fd3), # ΐΐ 36 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 37 (0x3b0, 0x1fe3), # ΰΰ 38 # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL 39 (0x3b2, 0x3d0), # βϐ 40 # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL 41 (0x3b5, 0x3f5), # εϵ 42 # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL 43 (0x3b8, 0x3d1), # θϑ 44 # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL 45 (0x3ba, 0x3f0), # κϰ 46 # GREEK SMALL LETTER PI, GREEK PI SYMBOL 47 (0x3c0, 0x3d6), # πϖ 48 # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL 49 (0x3c1, 0x3f1), # ρϱ 50 # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA 51 (0x3c2, 0x3c3), # ςσ 52 # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL 53 (0x3c6, 0x3d5), # φϕ 54 # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE 55 (0x1e61, 0x1e9b), # ṡẛ 56 # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST 57 (0xfb05, 0xfb06), # ſtst 58) 59 60# Maps the lowercase code to lowercase codes which have the same uppercase. 61_ignorecase_fixes = {i: tuple(j for j in t if i != j) 62 for t in _equivalences for i in t} 63 64def _compile(code, pattern, flags): 65 # internal: compile a (sub)pattern 66 emit = code.append 67 _len = len 68 LITERAL_CODES = _LITERAL_CODES 69 REPEATING_CODES = _REPEATING_CODES 70 SUCCESS_CODES = _SUCCESS_CODES 71 ASSERT_CODES = _ASSERT_CODES 72 if (flags & SRE_FLAG_IGNORECASE and 73 not (flags & SRE_FLAG_LOCALE) and 74 flags & SRE_FLAG_UNICODE): 75 fixes = _ignorecase_fixes 76 else: 77 fixes = None 78 for op, av in pattern: 79 if op in LITERAL_CODES: 80 if flags & SRE_FLAG_IGNORECASE: 81 lo = _sre.getlower(av, flags) 82 if fixes and lo in fixes: 83 emit(IN_IGNORE) 84 skip = _len(code); emit(0) 85 if op is NOT_LITERAL: 86 emit(NEGATE) 87 for k in (lo,) + fixes[lo]: 88 emit(LITERAL) 89 emit(k) 90 emit(FAILURE) 91 code[skip] = _len(code) - skip 92 else: 93 emit(OP_IGNORE[op]) 94 emit(lo) 95 else: 96 emit(op) 97 emit(av) 98 elif op is IN: 99 if flags & SRE_FLAG_IGNORECASE: 100 emit(OP_IGNORE[op]) 101 def fixup(literal, flags=flags): 102 return _sre.getlower(literal, flags) 103 else: 104 emit(op) 105 fixup = None 106 skip = _len(code); emit(0) 107 _compile_charset(av, flags, code, fixup, fixes) 108 code[skip] = _len(code) - skip 109 elif op is ANY: 110 if flags & SRE_FLAG_DOTALL: 111 emit(ANY_ALL) 112 else: 113 emit(ANY) 114 elif op in REPEATING_CODES: 115 if flags & SRE_FLAG_TEMPLATE: 116 raise error("internal: unsupported template operator") 117 elif _simple(av) and op is not REPEAT: 118 if op is MAX_REPEAT: 119 emit(REPEAT_ONE) 120 else: 121 emit(MIN_REPEAT_ONE) 122 skip = _len(code); emit(0) 123 emit(av[0]) 124 emit(av[1]) 125 _compile(code, av[2], flags) 126 emit(SUCCESS) 127 code[skip] = _len(code) - skip 128 else: 129 emit(REPEAT) 130 skip = _len(code); emit(0) 131 emit(av[0]) 132 emit(av[1]) 133 _compile(code, av[2], flags) 134 code[skip] = _len(code) - skip 135 if op is MAX_REPEAT: 136 emit(MAX_UNTIL) 137 else: 138 emit(MIN_UNTIL) 139 elif op is SUBPATTERN: 140 if av[0]: 141 emit(MARK) 142 emit((av[0]-1)*2) 143 # _compile_info(code, av[1], flags) 144 _compile(code, av[1], flags) 145 if av[0]: 146 emit(MARK) 147 emit((av[0]-1)*2+1) 148 elif op in SUCCESS_CODES: 149 emit(op) 150 elif op in ASSERT_CODES: 151 emit(op) 152 skip = _len(code); emit(0) 153 if av[0] >= 0: 154 emit(0) # look ahead 155 else: 156 lo, hi = av[1].getwidth() 157 if lo != hi: 158 raise error("look-behind requires fixed-width pattern") 159 emit(lo) # look behind 160 _compile(code, av[1], flags) 161 emit(SUCCESS) 162 code[skip] = _len(code) - skip 163 elif op is CALL: 164 emit(op) 165 skip = _len(code); emit(0) 166 _compile(code, av, flags) 167 emit(SUCCESS) 168 code[skip] = _len(code) - skip 169 elif op is AT: 170 emit(op) 171 if flags & SRE_FLAG_MULTILINE: 172 av = AT_MULTILINE.get(av, av) 173 if flags & SRE_FLAG_LOCALE: 174 av = AT_LOCALE.get(av, av) 175 elif flags & SRE_FLAG_UNICODE: 176 av = AT_UNICODE.get(av, av) 177 emit(av) 178 elif op is BRANCH: 179 emit(op) 180 tail = [] 181 tailappend = tail.append 182 for av in av[1]: 183 skip = _len(code); emit(0) 184 # _compile_info(code, av, flags) 185 _compile(code, av, flags) 186 emit(JUMP) 187 tailappend(_len(code)); emit(0) 188 code[skip] = _len(code) - skip 189 emit(FAILURE) # end of branch 190 for tail in tail: 191 code[tail] = _len(code) - tail 192 elif op is CATEGORY: 193 emit(op) 194 if flags & SRE_FLAG_LOCALE: 195 av = CH_LOCALE[av] 196 elif flags & SRE_FLAG_UNICODE: 197 av = CH_UNICODE[av] 198 emit(av) 199 elif op is GROUPREF: 200 if flags & SRE_FLAG_IGNORECASE: 201 emit(OP_IGNORE[op]) 202 else: 203 emit(op) 204 emit(av-1) 205 elif op is GROUPREF_EXISTS: 206 emit(op) 207 emit(av[0]-1) 208 skipyes = _len(code); emit(0) 209 _compile(code, av[1], flags) 210 if av[2]: 211 emit(JUMP) 212 skipno = _len(code); emit(0) 213 code[skipyes] = _len(code) - skipyes + 1 214 _compile(code, av[2], flags) 215 code[skipno] = _len(code) - skipno 216 else: 217 code[skipyes] = _len(code) - skipyes + 1 218 else: 219 raise ValueError("unsupported operand type", op) 220 221def _compile_charset(charset, flags, code, fixup=None, fixes=None): 222 # compile charset subprogram 223 emit = code.append 224 for op, av in _optimize_charset(charset, fixup, fixes): 225 emit(op) 226 if op is NEGATE: 227 pass 228 elif op is LITERAL: 229 emit(av) 230 elif op is RANGE or op is RANGE_IGNORE: 231 emit(av[0]) 232 emit(av[1]) 233 elif op is CHARSET: 234 code.extend(av) 235 elif op is BIGCHARSET: 236 code.extend(av) 237 elif op is CATEGORY: 238 if flags & SRE_FLAG_LOCALE: 239 emit(CH_LOCALE[av]) 240 elif flags & SRE_FLAG_UNICODE: 241 emit(CH_UNICODE[av]) 242 else: 243 emit(av) 244 else: 245 raise error("internal: unsupported set operator") 246 emit(FAILURE) 247 248def _optimize_charset(charset, fixup, fixes): 249 # internal: optimize character set 250 out = [] 251 tail = [] 252 charmap = bytearray(256) 253 for op, av in charset: 254 while True: 255 try: 256 if op is LITERAL: 257 if fixup: 258 lo = fixup(av) 259 charmap[lo] = 1 260 if fixes and lo in fixes: 261 for k in fixes[lo]: 262 charmap[k] = 1 263 else: 264 charmap[av] = 1 265 elif op is RANGE: 266 r = range(av[0], av[1]+1) 267 if fixup: 268 r = map(fixup, r) 269 if fixup and fixes: 270 for i in r: 271 charmap[i] = 1 272 if i in fixes: 273 for k in fixes[i]: 274 charmap[k] = 1 275 else: 276 for i in r: 277 charmap[i] = 1 278 elif op is NEGATE: 279 out.append((op, av)) 280 else: 281 tail.append((op, av)) 282 except IndexError: 283 if len(charmap) == 256: 284 # character set contains non-UCS1 character codes 285 charmap += b'\0' * 0xff00 286 continue 287 # Character set contains non-BMP character codes. 288 # There are only two ranges of cased non-BMP characters: 289 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), 290 # and for both ranges RANGE_IGNORE works. 291 if fixup and op is RANGE: 292 op = RANGE_IGNORE 293 tail.append((op, av)) 294 break 295 296 # compress character map 297 runs = [] 298 q = 0 299 while True: 300 p = charmap.find(1, q) 301 if p < 0: 302 break 303 if len(runs) >= 2: 304 runs = None 305 break 306 q = charmap.find(0, p) 307 if q < 0: 308 runs.append((p, len(charmap))) 309 break 310 runs.append((p, q)) 311 if runs is not None: 312 # use literal/range 313 for p, q in runs: 314 if q - p == 1: 315 out.append((LITERAL, p)) 316 else: 317 out.append((RANGE, (p, q - 1))) 318 out += tail 319 # if the case was changed or new representation is more compact 320 if fixup or len(out) < len(charset): 321 return out 322 # else original character set is good enough 323 return charset 324 325 # use bitmap 326 if len(charmap) == 256: 327 data = _mk_bitmap(charmap) 328 out.append((CHARSET, data)) 329 out += tail 330 return out 331 332 # To represent a big charset, first a bitmap of all characters in the 333 # set is constructed. Then, this bitmap is sliced into chunks of 256 334 # characters, duplicate chunks are eliminated, and each chunk is 335 # given a number. In the compiled expression, the charset is 336 # represented by a 32-bit word sequence, consisting of one word for 337 # the number of different chunks, a sequence of 256 bytes (64 words) 338 # of chunk numbers indexed by their original chunk position, and a 339 # sequence of 256-bit chunks (8 words each). 340 341 # Compression is normally good: in a typical charset, large ranges of 342 # Unicode will be either completely excluded (e.g. if only cyrillic 343 # letters are to be matched), or completely included (e.g. if large 344 # subranges of Kanji match). These ranges will be represented by 345 # chunks of all one-bits or all zero-bits. 346 347 # Matching can be also done efficiently: the more significant byte of 348 # the Unicode character is an index into the chunk number, and the 349 # less significant byte is a bit index in the chunk (just like the 350 # CHARSET matching). 351 352 charmap = bytes(charmap) # should be hashable 353 comps = {} 354 mapping = bytearray(256) 355 block = 0 356 data = bytearray() 357 for i in range(0, 65536, 256): 358 chunk = charmap[i: i + 256] 359 if chunk in comps: 360 mapping[i // 256] = comps[chunk] 361 else: 362 mapping[i // 256] = comps[chunk] = block 363 block += 1 364 data += chunk 365 data = _mk_bitmap(data) 366 data[0:0] = [block] + _bytes_to_codes(mapping) 367 out.append((BIGCHARSET, data)) 368 out += tail 369 return out 370 371_CODEBITS = _sre.CODESIZE * 8 372MAXCODE = (1 << _CODEBITS) - 1 373_BITS_TRANS = b'0' + b'1' * 255 374def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): 375 s = bits.translate(_BITS_TRANS)[::-1] 376 return [_int(s[i - _CODEBITS: i], 2) 377 for i in range(len(s), 0, -_CODEBITS)] 378 379def _bytes_to_codes(b): 380 # Convert block indices to word array 381 a = memoryview(b).cast('I') 382 assert a.itemsize == _sre.CODESIZE 383 assert len(a) * a.itemsize == len(b) 384 return a.tolist() 385 386def _simple(av): 387 # check if av is a "simple" operator 388 lo, hi = av[2].getwidth() 389 return lo == hi == 1 and av[2][0][0] != SUBPATTERN 390 391def _generate_overlap_table(prefix): 392 """ 393 Generate an overlap table for the following prefix. 394 An overlap table is a table of the same size as the prefix which 395 informs about the potential self-overlap for each index in the prefix: 396 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] 397 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with 398 prefix[0:k] 399 """ 400 table = [0] * len(prefix) 401 for i in range(1, len(prefix)): 402 idx = table[i - 1] 403 while prefix[i] != prefix[idx]: 404 if idx == 0: 405 table[i] = 0 406 break 407 idx = table[idx - 1] 408 else: 409 table[i] = idx + 1 410 return table 411 412def _compile_info(code, pattern, flags): 413 # internal: compile an info block. in the current version, 414 # this contains min/max pattern width, and an optional literal 415 # prefix or a character map 416 lo, hi = pattern.getwidth() 417 if hi > MAXCODE: 418 hi = MAXCODE 419 if lo == 0: 420 code.extend([INFO, 4, 0, lo, hi]) 421 return 422 # look for a literal prefix 423 prefix = [] 424 prefixappend = prefix.append 425 prefix_skip = 0 426 charset = [] # not used 427 charsetappend = charset.append 428 if not (flags & SRE_FLAG_IGNORECASE): 429 # look for literal prefix 430 for op, av in pattern.data: 431 if op is LITERAL: 432 if len(prefix) == prefix_skip: 433 prefix_skip = prefix_skip + 1 434 prefixappend(av) 435 elif op is SUBPATTERN and len(av[1]) == 1: 436 op, av = av[1][0] 437 if op is LITERAL: 438 prefixappend(av) 439 else: 440 break 441 else: 442 break 443 # if no prefix, look for charset prefix 444 if not prefix and pattern.data: 445 op, av = pattern.data[0] 446 if op is SUBPATTERN and av[1]: 447 op, av = av[1][0] 448 if op is LITERAL: 449 charsetappend((op, av)) 450 elif op is BRANCH: 451 c = [] 452 cappend = c.append 453 for p in av[1]: 454 if not p: 455 break 456 op, av = p[0] 457 if op is LITERAL: 458 cappend((op, av)) 459 else: 460 break 461 else: 462 charset = c 463 elif op is BRANCH: 464 c = [] 465 cappend = c.append 466 for p in av[1]: 467 if not p: 468 break 469 op, av = p[0] 470 if op is LITERAL: 471 cappend((op, av)) 472 else: 473 break 474 else: 475 charset = c 476 elif op is IN: 477 charset = av 478## if prefix: 479## print("*** PREFIX", prefix, prefix_skip) 480## if charset: 481## print("*** CHARSET", charset) 482 # add an info block 483 emit = code.append 484 emit(INFO) 485 skip = len(code); emit(0) 486 # literal flag 487 mask = 0 488 if prefix: 489 mask = SRE_INFO_PREFIX 490 if len(prefix) == prefix_skip == len(pattern.data): 491 mask = mask | SRE_INFO_LITERAL 492 elif charset: 493 mask = mask | SRE_INFO_CHARSET 494 emit(mask) 495 # pattern length 496 if lo < MAXCODE: 497 emit(lo) 498 else: 499 emit(MAXCODE) 500 prefix = prefix[:MAXCODE] 501 emit(min(hi, MAXCODE)) 502 # add literal prefix 503 if prefix: 504 emit(len(prefix)) # length 505 emit(prefix_skip) # skip 506 code.extend(prefix) 507 # generate overlap table 508 code.extend(_generate_overlap_table(prefix)) 509 elif charset: 510 _compile_charset(charset, flags, code) 511 code[skip] = len(code) - skip 512 513def isstring(obj): 514 return isinstance(obj, (str, bytes)) 515 516def _code(p, flags): 517 518 flags = p.pattern.flags | flags 519 code = [] 520 521 # compile info block 522 _compile_info(code, p, flags) 523 524 # compile the pattern 525 _compile(code, p.data, flags) 526 527 code.append(SUCCESS) 528 529 return code 530 531def compile(p, flags=0): 532 # internal: convert pattern list to internal format 533 534 if isstring(p): 535 pattern = p 536 p = sre_parse.parse(p, flags) 537 else: 538 pattern = None 539 540 code = _code(p, flags) 541 542 # print(code) 543 544 # map in either direction 545 groupindex = p.pattern.groupdict 546 indexgroup = [None] * p.pattern.groups 547 for k, i in groupindex.items(): 548 indexgroup[i] = k 549 550 return _sre.compile( 551 pattern, flags | p.pattern.flags, code, 552 p.pattern.groups-1, 553 groupindex, indexgroup 554 ) 555