10a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# 20a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# Secret Labs' Regular Expression Engine 30a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# 40a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# convert template to internal format 50a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# 60a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 70a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# 80a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# See the sre.py file for information on usage and redistribution. 90a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# 100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao"""Internal support module for sre""" 120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 130a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport _sre, sys 140a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoimport sre_parse 150a8c90248264a8b26970b4473770bcc3df8515fJosh Gaofrom sre_constants import * 160a8c90248264a8b26970b4473770bcc3df8515fJosh Gaofrom _sre import MAXREPEAT 170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 180a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoassert _sre.MAGIC == MAGIC, "SRE module mismatch" 190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 200a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoif _sre.CODESIZE == 2: 210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao MAXCODE = 65535 220a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoelse: 230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao MAXCODE = 0xFFFFFFFFL 240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 250a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _identityfunction(x): 260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return x 270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) 290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) 300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao_SUCCESS_CODES = set([SUCCESS, FAILURE]) 310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) 320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 330a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _compile(code, pattern, flags): 340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # internal: compile a (sub)pattern 350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit = code.append 360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _len = len 370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao LITERAL_CODES = _LITERAL_CODES 380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao REPEATING_CODES = _REPEATING_CODES 390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao SUCCESS_CODES = _SUCCESS_CODES 400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao ASSERT_CODES = _ASSERT_CODES 410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for op, av in pattern: 420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op in LITERAL_CODES: 430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_IGNORECASE: 440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[OP_IGNORE[op]]) 450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(_sre.getlower(av, flags)) 460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av) 490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is IN: 500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_IGNORECASE: 510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[OP_IGNORE[op]]) 520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao def fixup(literal, flags=flags): 530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return _sre.getlower(literal, flags) 540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao fixup = _identityfunction 570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = _len(code); emit(0) 580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile_charset(av, flags, code, fixup) 590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = _len(code) - skip 600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is ANY: 610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_DOTALL: 620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[ANY_ALL]) 630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[ANY]) 650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op in REPEATING_CODES: 660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_TEMPLATE: 670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao raise error, "internal: unsupported template operator" 680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[REPEAT]) 690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = _len(code); emit(0) 700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av[0]) 710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av[1]) 720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av[2], flags) 730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[SUCCESS]) 740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = _len(code) - skip 750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif _simple(av) and op is not REPEAT: 760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is MAX_REPEAT: 770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[REPEAT_ONE]) 780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[MIN_REPEAT_ONE]) 800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = _len(code); emit(0) 810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av[0]) 820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av[1]) 830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av[2], flags) 840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[SUCCESS]) 850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = _len(code) - skip 860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[REPEAT]) 880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = _len(code); emit(0) 890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av[0]) 900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av[1]) 910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av[2], flags) 920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = _len(code) - skip 930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is MAX_REPEAT: 940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[MAX_UNTIL]) 950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[MIN_UNTIL]) 970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is SUBPATTERN: 980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if av[0]: 990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[MARK]) 1000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit((av[0]-1)*2) 1010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # _compile_info(code, av[1], flags) 1020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av[1], flags) 1030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if av[0]: 1040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[MARK]) 1050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit((av[0]-1)*2+1) 1060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op in SUCCESS_CODES: 1070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op in ASSERT_CODES: 1090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = _len(code); emit(0) 1110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if av[0] >= 0: 1120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(0) # look ahead 1130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 1140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao lo, hi = av[1].getwidth() 1150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if lo != hi: 1160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao raise error, "look-behind requires fixed-width pattern" 1170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(lo) # look behind 1180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av[1], flags) 1190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[SUCCESS]) 1200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = _len(code) - skip 1210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is CALL: 1220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = _len(code); emit(0) 1240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av, flags) 1250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[SUCCESS]) 1260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = _len(code) - skip 1270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is AT: 1280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_MULTILINE: 1300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao av = AT_MULTILINE.get(av, av) 1310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_LOCALE: 1320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao av = AT_LOCALE.get(av, av) 1330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif flags & SRE_FLAG_UNICODE: 1340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao av = AT_UNICODE.get(av, av) 1350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(ATCODES[av]) 1360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is BRANCH: 1370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tail = [] 1390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tailappend = tail.append 1400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for av in av[1]: 1410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = _len(code); emit(0) 1420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # _compile_info(code, av, flags) 1430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av, flags) 1440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[JUMP]) 1450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao tailappend(_len(code)); emit(0) 1460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = _len(code) - skip 1470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(0) # end of branch 1480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for tail in tail: 1490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[tail] = _len(code) - tail 1500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is CATEGORY: 1510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_LOCALE: 1530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao av = CH_LOCALE[av] 1540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif flags & SRE_FLAG_UNICODE: 1550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao av = CH_UNICODE[av] 1560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(CHCODES[av]) 1570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is GROUPREF: 1580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_IGNORECASE: 1590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[OP_IGNORE[op]]) 1600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 1610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av-1) 1630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is GROUPREF_EXISTS: 1640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(av[0]-1) 1660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skipyes = _len(code); emit(0) 1670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av[1], flags) 1680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if av[2]: 1690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[JUMP]) 1700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skipno = _len(code); emit(0) 1710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skipyes] = _len(code) - skipyes + 1 1720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, av[2], flags) 1730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skipno] = _len(code) - skipno 1740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 1750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skipyes] = _len(code) - skipyes + 1 1760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 1770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao raise ValueError, ("unsupported operand type", op) 1780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 1790a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _compile_charset(charset, flags, code, fixup=None): 1800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # compile charset subprogram 1810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit = code.append 1820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if fixup is None: 1830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao fixup = _identityfunction 1840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for op, av in _optimize_charset(charset, fixup): 1850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[op]) 1860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is NEGATE: 1870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao pass 1880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is LITERAL: 1890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(fixup(av)) 1900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is RANGE: 1910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(fixup(av[0])) 1920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(fixup(av[1])) 1930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is CHARSET: 1940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code.extend(av) 1950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is BIGCHARSET: 1960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code.extend(av) 1970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is CATEGORY: 1980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if flags & SRE_FLAG_LOCALE: 1990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(CHCODES[CH_LOCALE[av]]) 2000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif flags & SRE_FLAG_UNICODE: 2010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(CHCODES[CH_UNICODE[av]]) 2020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 2030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(CHCODES[av]) 2040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 2050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao raise error, "internal: unsupported set operator" 2060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[FAILURE]) 2070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2080a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _optimize_charset(charset, fixup): 2090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # internal: optimize character set 2100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao out = [] 2110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao outappend = out.append 2120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charmap = [0]*256 2130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 2140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for op, av in charset: 2150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is NEGATE: 2160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao outappend((op, av)) 2170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is LITERAL: 2180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charmap[fixup(av)] = 1 2190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is RANGE: 2200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for i in range(fixup(av[0]), fixup(av[1])+1): 2210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charmap[i] = 1 2220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is CATEGORY: 2230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX: could append to charmap tail 2240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return charset # cannot compress 2250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except IndexError: 2260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # character set contains unicode characters 2270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return _optimize_unicode(charset, fixup) 2280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # compress character map 2290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = p = n = 0 2300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao runs = [] 2310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao runsappend = runs.append 2320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for c in charmap: 2330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if c: 2340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if n == 0: 2350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao p = i 2360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao n = n + 1 2370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif n: 2380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao runsappend((p, n)) 2390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao n = 0 2400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao i = i + 1 2410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if n: 2420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao runsappend((p, n)) 2430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if len(runs) <= 2: 2440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # use literal/range 2450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for p, n in runs: 2460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if n == 1: 2470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao outappend((LITERAL, p)) 2480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 2490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao outappend((RANGE, (p, p+n-1))) 2500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if len(out) < len(charset): 2510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return out 2520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 2530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # use bitmap 2540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao data = _mk_bitmap(charmap) 2550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao outappend((CHARSET, data)) 2560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return out 2570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return charset 2580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2590a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _mk_bitmap(bits): 2600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao data = [] 2610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao dataappend = data.append 2620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if _sre.CODESIZE == 2: 2630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao start = (1, 0) 2640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 2650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao start = (1L, 0L) 2660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao m, v = start 2670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for c in bits: 2680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if c: 2690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao v = v + m 2700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao m = m + m 2710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if m > MAXCODE: 2720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao dataappend(v) 2730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao m, v = start 2740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return data 2750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# To represent a big charset, first a bitmap of all characters in the 2770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# set is constructed. Then, this bitmap is sliced into chunks of 256 2780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# characters, duplicate chunks are eliminated, and each chunk is 2790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# given a number. In the compiled expression, the charset is 2800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# represented by a 16-bit word sequence, consisting of one word for 2810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# the number of different chunks, a sequence of 256 bytes (128 words) 2820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# of chunk numbers indexed by their original chunk position, and a 2830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# sequence of chunks (16 words each). 2840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# Compression is normally good: in a typical charset, large ranges of 2860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# Unicode will be either completely excluded (e.g. if only cyrillic 2870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# letters are to be matched), or completely included (e.g. if large 2880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# subranges of Kanji match). These ranges will be represented by 2890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# chunks of all one-bits or all zero-bits. 2900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# Matching can be also done efficiently: the more significant byte of 2920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# the Unicode character is an index into the chunk number, and the 2930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# less significant byte is a bit index in the chunk (just like the 2940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# CHARSET matching). 2950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 2960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 2970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# of the basic multilingual plane; an efficient representation 2980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# for all of UTF-16 has not yet been developed. This means, 2990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# in particular, that negated charsets cannot be represented as 3000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao# bigcharsets. 3010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3020a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _optimize_unicode(charset, fixup): 3030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 3040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao import array 3050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except ImportError: 3060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return charset 3070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charmap = [0]*65536 3080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao negate = 0 3090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao try: 3100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for op, av in charset: 3110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is NEGATE: 3120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao negate = 1 3130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is LITERAL: 3140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charmap[fixup(av)] = 1 3150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is RANGE: 3160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for i in xrange(fixup(av[0]), fixup(av[1])+1): 3170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charmap[i] = 1 3180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is CATEGORY: 3190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX: could expand category 3200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return charset # cannot compress 3210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao except IndexError: 3220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # non-BMP characters 3230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return charset 3240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if negate: 3250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if sys.maxunicode != 65535: 3260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX: negation does not work with big charsets 3270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return charset 3280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for i in xrange(65536): 3290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charmap[i] = not charmap[i] 3300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao comps = {} 3310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mapping = [0]*256 3320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao block = 0 3330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao data = [] 3340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for i in xrange(256): 3350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao chunk = tuple(charmap[i*256:(i+1)*256]) 3360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao new = comps.setdefault(chunk, block) 3370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mapping[i] = new 3380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if new == block: 3390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao block = block + 1 3400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao data = data + _mk_bitmap(chunk) 3410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao header = [block] 3420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if _sre.CODESIZE == 2: 3430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code = 'H' 3440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code = 'I' 3460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Convert block indices to byte array of 256 bytes 3470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mapping = array.array('b', mapping).tostring() 3480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # Convert byte array to word array 3490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mapping = array.array(code, mapping) 3500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao assert mapping.itemsize == _sre.CODESIZE 3510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao header = header + mapping.tolist() 3520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao data[0:0] = header 3530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return [(BIGCHARSET, data)] 3540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3550a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _simple(av): 3560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # check if av is a "simple" operator 3570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao lo, hi = av[2].getwidth() 3580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if lo == 0 and hi == MAXREPEAT: 3590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao raise error, "nothing to repeat" 3600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return lo == hi == 1 and av[2][0][0] != SUBPATTERN 3610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 3620a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _compile_info(code, pattern, flags): 3630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # internal: compile an info block. in the current version, 3640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # this contains min/max pattern width, and an optional literal 3650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # prefix or a character map 3660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao lo, hi = pattern.getwidth() 3670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if lo == 0: 3680a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return # not worth it 3690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # look for a literal prefix 3700a8c90248264a8b26970b4473770bcc3df8515fJosh Gao prefix = [] 3710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao prefixappend = prefix.append 3720a8c90248264a8b26970b4473770bcc3df8515fJosh Gao prefix_skip = 0 3730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charset = [] # not used 3740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charsetappend = charset.append 3750a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not (flags & SRE_FLAG_IGNORECASE): 3760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # look for literal prefix 3770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for op, av in pattern.data: 3780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is LITERAL: 3790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if len(prefix) == prefix_skip: 3800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao prefix_skip = prefix_skip + 1 3810a8c90248264a8b26970b4473770bcc3df8515fJosh Gao prefixappend(av) 3820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is SUBPATTERN and len(av[1]) == 1: 3830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao op, av = av[1][0] 3840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is LITERAL: 3850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao prefixappend(av) 3860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 3880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 3890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 3900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # if no prefix, look for charset prefix 3910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not prefix and pattern.data: 3920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao op, av = pattern.data[0] 3930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is SUBPATTERN and av[1]: 3940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao op, av = av[1][0] 3950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is LITERAL: 3960a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charsetappend((op, av)) 3970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is BRANCH: 3980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao c = [] 3990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao cappend = c.append 4000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for p in av[1]: 4010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not p: 4020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 4030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao op, av = p[0] 4040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is LITERAL: 4050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao cappend((op, av)) 4060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 4080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charset = c 4100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is BRANCH: 4110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao c = [] 4120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao cappend = c.append 4130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for p in av[1]: 4140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if not p: 4150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 4160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao op, av = p[0] 4170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if op is LITERAL: 4180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao cappend((op, av)) 4190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao break 4210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charset = c 4230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif op is IN: 4240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao charset = av 4250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao## if prefix: 4260a8c90248264a8b26970b4473770bcc3df8515fJosh Gao## print "*** PREFIX", prefix, prefix_skip 4270a8c90248264a8b26970b4473770bcc3df8515fJosh Gao## if charset: 4280a8c90248264a8b26970b4473770bcc3df8515fJosh Gao## print "*** CHARSET", charset 4290a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # add an info block 4300a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit = code.append 4310a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(OPCODES[INFO]) 4320a8c90248264a8b26970b4473770bcc3df8515fJosh Gao skip = len(code); emit(0) 4330a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # literal flag 4340a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mask = 0 4350a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if prefix: 4360a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mask = SRE_INFO_PREFIX 4370a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if len(prefix) == prefix_skip == len(pattern.data): 4380a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mask = mask + SRE_INFO_LITERAL 4390a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif charset: 4400a8c90248264a8b26970b4473770bcc3df8515fJosh Gao mask = mask + SRE_INFO_CHARSET 4410a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(mask) 4420a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # pattern length 4430a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if lo < MAXCODE: 4440a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(lo) 4450a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4460a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(MAXCODE) 4470a8c90248264a8b26970b4473770bcc3df8515fJosh Gao prefix = prefix[:MAXCODE] 4480a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if hi < MAXCODE: 4490a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(hi) 4500a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 4510a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(0) 4520a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # add literal prefix 4530a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if prefix: 4540a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(len(prefix)) # length 4550a8c90248264a8b26970b4473770bcc3df8515fJosh Gao emit(prefix_skip) # skip 4560a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code.extend(prefix) 4570a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # generate overlap table 4580a8c90248264a8b26970b4473770bcc3df8515fJosh Gao table = [-1] + ([0]*len(prefix)) 4590a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for i in xrange(len(prefix)): 4600a8c90248264a8b26970b4473770bcc3df8515fJosh Gao table[i+1] = table[i]+1 4610a8c90248264a8b26970b4473770bcc3df8515fJosh Gao while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 4620a8c90248264a8b26970b4473770bcc3df8515fJosh Gao table[i+1] = table[table[i+1]-1]+1 4630a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code.extend(table[1:]) # don't store first entry 4640a8c90248264a8b26970b4473770bcc3df8515fJosh Gao elif charset: 4650a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile_charset(charset, flags, code) 4660a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code[skip] = len(code) - skip 4670a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4680a8c90248264a8b26970b4473770bcc3df8515fJosh Gaotry: 4690a8c90248264a8b26970b4473770bcc3df8515fJosh Gao unicode 4700a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoexcept NameError: 4710a8c90248264a8b26970b4473770bcc3df8515fJosh Gao STRING_TYPES = (type(""),) 4720a8c90248264a8b26970b4473770bcc3df8515fJosh Gaoelse: 4730a8c90248264a8b26970b4473770bcc3df8515fJosh Gao STRING_TYPES = (type(""), type(unicode(""))) 4740a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4750a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef isstring(obj): 4760a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for tp in STRING_TYPES: 4770a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if isinstance(obj, tp): 4780a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 1 4790a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return 0 4800a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4810a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef _code(p, flags): 4820a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4830a8c90248264a8b26970b4473770bcc3df8515fJosh Gao flags = p.pattern.flags | flags 4840a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code = [] 4850a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4860a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # compile info block 4870a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile_info(code, p, flags) 4880a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4890a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # compile the pattern 4900a8c90248264a8b26970b4473770bcc3df8515fJosh Gao _compile(code, p.data, flags) 4910a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4920a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code.append(OPCODES[SUCCESS]) 4930a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4940a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return code 4950a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4960a8c90248264a8b26970b4473770bcc3df8515fJosh Gaodef compile(p, flags=0): 4970a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # internal: convert pattern list to internal format 4980a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 4990a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if isstring(p): 5000a8c90248264a8b26970b4473770bcc3df8515fJosh Gao pattern = p 5010a8c90248264a8b26970b4473770bcc3df8515fJosh Gao p = sre_parse.parse(p, flags) 5020a8c90248264a8b26970b4473770bcc3df8515fJosh Gao else: 5030a8c90248264a8b26970b4473770bcc3df8515fJosh Gao pattern = None 5040a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5050a8c90248264a8b26970b4473770bcc3df8515fJosh Gao code = _code(p, flags) 5060a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5070a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # print code 5080a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5090a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # XXX: <fl> get rid of this limitation! 5100a8c90248264a8b26970b4473770bcc3df8515fJosh Gao if p.pattern.groups > 100: 5110a8c90248264a8b26970b4473770bcc3df8515fJosh Gao raise AssertionError( 5120a8c90248264a8b26970b4473770bcc3df8515fJosh Gao "sorry, but this version only supports 100 named groups" 5130a8c90248264a8b26970b4473770bcc3df8515fJosh Gao ) 5140a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5150a8c90248264a8b26970b4473770bcc3df8515fJosh Gao # map in either direction 5160a8c90248264a8b26970b4473770bcc3df8515fJosh Gao groupindex = p.pattern.groupdict 5170a8c90248264a8b26970b4473770bcc3df8515fJosh Gao indexgroup = [None] * p.pattern.groups 5180a8c90248264a8b26970b4473770bcc3df8515fJosh Gao for k, i in groupindex.items(): 5190a8c90248264a8b26970b4473770bcc3df8515fJosh Gao indexgroup[i] = k 5200a8c90248264a8b26970b4473770bcc3df8515fJosh Gao 5210a8c90248264a8b26970b4473770bcc3df8515fJosh Gao return _sre.compile( 5220a8c90248264a8b26970b4473770bcc3df8515fJosh Gao pattern, flags | p.pattern.flags, code, 5230a8c90248264a8b26970b4473770bcc3df8515fJosh Gao p.pattern.groups-1, 5240a8c90248264a8b26970b4473770bcc3df8515fJosh Gao groupindex, indexgroup 5250a8c90248264a8b26970b4473770bcc3df8515fJosh Gao ) 526