10c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# 20c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# Secret Labs' Regular Expression Engine 30c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# 40c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# convert template to internal format 50c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# 60c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 70c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# 80c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# See the sre.py file for information on usage and redistribution. 90c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# 100c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 110c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi"""Internal support module for sre""" 120c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 130c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yiimport _sre, sys 140c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yiimport sre_parse 150c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yifrom sre_constants import * 160c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yifrom _sre import MAXREPEAT 170c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 180c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yiassert _sre.MAGIC == MAGIC, "SRE module mismatch" 190c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 200c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yiif _sre.CODESIZE == 2: 210c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi MAXCODE = 65535 220c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yielse: 230c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi MAXCODE = 0xFFFFFFFFL 240c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 250c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _identityfunction(x): 260c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return x 270c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 280c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) 290c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) 
300c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi_SUCCESS_CODES = set([SUCCESS, FAILURE]) 310c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) 320c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 330c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _compile(code, pattern, flags): 340c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # internal: compile a (sub)pattern 350c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit = code.append 360c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _len = len 370c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi LITERAL_CODES = _LITERAL_CODES 380c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi REPEATING_CODES = _REPEATING_CODES 390c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi SUCCESS_CODES = _SUCCESS_CODES 400c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi ASSERT_CODES = _ASSERT_CODES 410c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for op, av in pattern: 420c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op in LITERAL_CODES: 430c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_IGNORECASE: 440c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[OP_IGNORE[op]]) 450c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(_sre.getlower(av, flags)) 460c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 470c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 480c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av) 490c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is IN: 500c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_IGNORECASE: 510c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[OP_IGNORE[op]]) 520c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi def fixup(literal, flags=flags): 530c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return _sre.getlower(literal, flags) 540c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 550c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 560c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi fixup = _identityfunction 
570c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = _len(code); emit(0) 580c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile_charset(av, flags, code, fixup) 590c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = _len(code) - skip 600c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is ANY: 610c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_DOTALL: 620c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[ANY_ALL]) 630c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 640c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[ANY]) 650c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op in REPEATING_CODES: 660c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_TEMPLATE: 670c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi raise error, "internal: unsupported template operator" 680c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[REPEAT]) 690c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = _len(code); emit(0) 700c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av[0]) 710c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av[1]) 720c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av[2], flags) 730c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[SUCCESS]) 740c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = _len(code) - skip 750c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif _simple(av) and op is not REPEAT: 760c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is MAX_REPEAT: 770c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[REPEAT_ONE]) 780c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 790c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[MIN_REPEAT_ONE]) 800c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = _len(code); emit(0) 810c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av[0]) 820c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av[1]) 830c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av[2], flags) 
840c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[SUCCESS]) 850c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = _len(code) - skip 860c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 870c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[REPEAT]) 880c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = _len(code); emit(0) 890c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av[0]) 900c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av[1]) 910c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av[2], flags) 920c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = _len(code) - skip 930c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is MAX_REPEAT: 940c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[MAX_UNTIL]) 950c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 960c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[MIN_UNTIL]) 970c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is SUBPATTERN: 980c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if av[0]: 990c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[MARK]) 1000c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit((av[0]-1)*2) 1010c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # _compile_info(code, av[1], flags) 1020c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av[1], flags) 1030c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if av[0]: 1040c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[MARK]) 1050c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit((av[0]-1)*2+1) 1060c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op in SUCCESS_CODES: 1070c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1080c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op in ASSERT_CODES: 1090c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1100c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = _len(code); emit(0) 1110c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if av[0] >= 0: 
1120c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(0) # look ahead 1130c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 1140c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi lo, hi = av[1].getwidth() 1150c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if lo != hi: 1160c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi raise error, "look-behind requires fixed-width pattern" 1170c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(lo) # look behind 1180c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av[1], flags) 1190c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[SUCCESS]) 1200c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = _len(code) - skip 1210c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is CALL: 1220c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1230c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = _len(code); emit(0) 1240c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av, flags) 1250c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[SUCCESS]) 1260c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = _len(code) - skip 1270c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is AT: 1280c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1290c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_MULTILINE: 1300c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi av = AT_MULTILINE.get(av, av) 1310c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_LOCALE: 1320c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi av = AT_LOCALE.get(av, av) 1330c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif flags & SRE_FLAG_UNICODE: 1340c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi av = AT_UNICODE.get(av, av) 1350c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(ATCODES[av]) 1360c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is BRANCH: 1370c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1380c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi tail = [] 
1390c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi tailappend = tail.append 1400c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for av in av[1]: 1410c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = _len(code); emit(0) 1420c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # _compile_info(code, av, flags) 1430c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av, flags) 1440c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[JUMP]) 1450c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi tailappend(_len(code)); emit(0) 1460c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = _len(code) - skip 1470c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(0) # end of branch 1480c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for tail in tail: 1490c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[tail] = _len(code) - tail 1500c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is CATEGORY: 1510c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1520c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_LOCALE: 1530c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi av = CH_LOCALE[av] 1540c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif flags & SRE_FLAG_UNICODE: 1550c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi av = CH_UNICODE[av] 1560c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(CHCODES[av]) 1570c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is GROUPREF: 1580c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_IGNORECASE: 1590c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[OP_IGNORE[op]]) 1600c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 1610c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1620c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av-1) 1630c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is GROUPREF_EXISTS: 1640c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1650c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(av[0]-1) 
1660c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skipyes = _len(code); emit(0) 1670c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av[1], flags) 1680c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if av[2]: 1690c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[JUMP]) 1700c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skipno = _len(code); emit(0) 1710c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skipyes] = _len(code) - skipyes + 1 1720c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile(code, av[2], flags) 1730c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skipno] = _len(code) - skipno 1740c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 1750c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skipyes] = _len(code) - skipyes + 1 1760c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 1770c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi raise ValueError, ("unsupported operand type", op) 1780c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 1790c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _compile_charset(charset, flags, code, fixup=None): 1800c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # compile charset subprogram 1810c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit = code.append 1820c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if fixup is None: 1830c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi fixup = _identityfunction 1840c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for op, av in _optimize_charset(charset, fixup): 1850c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[op]) 1860c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is NEGATE: 1870c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi pass 1880c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is LITERAL: 1890c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(fixup(av)) 1900c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is RANGE: 1910c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(fixup(av[0])) 1920c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 
emit(fixup(av[1])) 1930c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is CHARSET: 1940c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code.extend(av) 1950c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is BIGCHARSET: 1960c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code.extend(av) 1970c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is CATEGORY: 1980c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if flags & SRE_FLAG_LOCALE: 1990c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(CHCODES[CH_LOCALE[av]]) 2000c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif flags & SRE_FLAG_UNICODE: 2010c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(CHCODES[CH_UNICODE[av]]) 2020c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 2030c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(CHCODES[av]) 2040c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 2050c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi raise error, "internal: unsupported set operator" 2060c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[FAILURE]) 2070c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 2080c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _optimize_charset(charset, fixup): 2090c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # internal: optimize character set 2100c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi out = [] 2110c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi outappend = out.append 2120c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charmap = [0]*256 2130c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi try: 2140c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for op, av in charset: 2150c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is NEGATE: 2160c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi outappend((op, av)) 2170c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is LITERAL: 2180c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charmap[fixup(av)] = 1 2190c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is RANGE: 2200c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for i in 
range(fixup(av[0]), fixup(av[1])+1): 2210c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charmap[i] = 1 2220c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is CATEGORY: 2230c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # XXX: could append to charmap tail 2240c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return charset # cannot compress 2250c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi except IndexError: 2260c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # character set contains unicode characters 2270c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return _optimize_unicode(charset, fixup) 2280c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # compress character map 2290c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi i = p = n = 0 2300c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi runs = [] 2310c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi runsappend = runs.append 2320c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for c in charmap: 2330c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if c: 2340c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if n == 0: 2350c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi p = i 2360c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi n = n + 1 2370c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif n: 2380c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi runsappend((p, n)) 2390c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi n = 0 2400c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi i = i + 1 2410c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if n: 2420c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi runsappend((p, n)) 2430c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if len(runs) <= 2: 2440c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # use literal/range 2450c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for p, n in runs: 2460c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if n == 1: 2470c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi outappend((LITERAL, p)) 2480c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 2490c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 
outappend((RANGE, (p, p+n-1))) 2500c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if len(out) < len(charset): 2510c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return out 2520c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 2530c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # use bitmap 2540c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi data = _mk_bitmap(charmap) 2550c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi outappend((CHARSET, data)) 2560c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return out 2570c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return charset 2580c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 2590c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _mk_bitmap(bits): 2600c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi data = [] 2610c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi dataappend = data.append 2620c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if _sre.CODESIZE == 2: 2630c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi start = (1, 0) 2640c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 2650c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi start = (1L, 0L) 2660c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi m, v = start 2670c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for c in bits: 2680c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if c: 2690c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi v = v + m 2700c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi m = m + m 2710c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if m > MAXCODE: 2720c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi dataappend(v) 2730c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi m, v = start 2740c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return data 2750c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 2760c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# To represent a big charset, first a bitmap of all characters in the 2770c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# set is constructed. 
Then, this bitmap is sliced into chunks of 256 2780c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# characters, duplicate chunks are eliminated, and each chunk is 2790c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# given a number. In the compiled expression, the charset is 2800c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# represented by a 16-bit word sequence, consisting of one word for 2810c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# the number of different chunks, a sequence of 256 bytes (128 words) 2820c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# of chunk numbers indexed by their original chunk position, and a 2830c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# sequence of chunks (16 words each). 2840c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 2850c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# Compression is normally good: in a typical charset, large ranges of 2860c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# Unicode will be either completely excluded (e.g. if only cyrillic 2870c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# letters are to be matched), or completely included (e.g. if large 2880c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# subranges of Kanji match). These ranges will be represented by 2890c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# chunks of all one-bits or all zero-bits. 2900c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 2910c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# Matching can be also done efficiently: the more significant byte of 2920c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# the Unicode character is an index into the chunk number, and the 2930c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# less significant byte is a bit index in the chunk (just like the 2940c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# CHARSET matching). 
2950c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 2960c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 2970c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# of the basic multilingual plane; an efficient representation 2980c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# for all of UTF-16 has not yet been developed. This means, 2990c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# in particular, that negated charsets cannot be represented as 3000c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi# bigcharsets. 3010c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 3020c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _optimize_unicode(charset, fixup): 3030c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi try: 3040c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi import array 3050c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi except ImportError: 3060c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return charset 3070c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charmap = [0]*65536 3080c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi negate = 0 3090c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi try: 3100c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for op, av in charset: 3110c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is NEGATE: 3120c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi negate = 1 3130c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is LITERAL: 3140c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charmap[fixup(av)] = 1 3150c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is RANGE: 3160c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for i in xrange(fixup(av[0]), fixup(av[1])+1): 3170c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charmap[i] = 1 3180c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is CATEGORY: 3190c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # XXX: could expand category 3200c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return charset # cannot compress 3210c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 
except IndexError: 3220c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # non-BMP characters 3230c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return charset 3240c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if negate: 3250c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if sys.maxunicode != 65535: 3260c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # XXX: negation does not work with big charsets 3270c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return charset 3280c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for i in xrange(65536): 3290c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charmap[i] = not charmap[i] 3300c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi comps = {} 3310c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mapping = [0]*256 3320c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi block = 0 3330c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi data = [] 3340c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for i in xrange(256): 3350c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi chunk = tuple(charmap[i*256:(i+1)*256]) 3360c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi new = comps.setdefault(chunk, block) 3370c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mapping[i] = new 3380c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if new == block: 3390c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi block = block + 1 3400c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi data = data + _mk_bitmap(chunk) 3410c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi header = [block] 3420c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if _sre.CODESIZE == 2: 3430c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code = 'H' 3440c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 3450c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code = 'I' 3460c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # Convert block indices to byte array of 256 bytes 3470c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mapping = array.array('b', mapping).tostring() 3480c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # Convert byte array to word array 
3490c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mapping = array.array(code, mapping) 3500c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi assert mapping.itemsize == _sre.CODESIZE 3510c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi header = header + mapping.tolist() 3520c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi data[0:0] = header 3530c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return [(BIGCHARSET, data)] 3540c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 3550c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _simple(av): 3560c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # check if av is a "simple" operator 3570c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi lo, hi = av[2].getwidth() 3580c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if lo == 0 and hi == MAXREPEAT: 3590c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi raise error, "nothing to repeat" 3600c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return lo == hi == 1 and av[2][0][0] != SUBPATTERN 3610c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 3620c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _compile_info(code, pattern, flags): 3630c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # internal: compile an info block. 
in the current version, 3640c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # this contains min/max pattern width, and an optional literal 3650c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # prefix or a character map 3660c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi lo, hi = pattern.getwidth() 3670c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if lo == 0: 3680c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi return # not worth it 3690c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # look for a literal prefix 3700c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi prefix = [] 3710c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi prefixappend = prefix.append 3720c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi prefix_skip = 0 3730c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charset = [] # not used 3740c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charsetappend = charset.append 3750c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if not (flags & SRE_FLAG_IGNORECASE): 3760c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # look for literal prefix 3770c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for op, av in pattern.data: 3780c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is LITERAL: 3790c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if len(prefix) == prefix_skip: 3800c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi prefix_skip = prefix_skip + 1 3810c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi prefixappend(av) 3820c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is SUBPATTERN and len(av[1]) == 1: 3830c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi op, av = av[1][0] 3840c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is LITERAL: 3850c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi prefixappend(av) 3860c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 3870c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi break 3880c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 3890c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi break 3900c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # if no prefix, look for 
charset prefix 3910c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if not prefix and pattern.data: 3920c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi op, av = pattern.data[0] 3930c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is SUBPATTERN and av[1]: 3940c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi op, av = av[1][0] 3950c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is LITERAL: 3960c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charsetappend((op, av)) 3970c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is BRANCH: 3980c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi c = [] 3990c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi cappend = c.append 4000c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for p in av[1]: 4010c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if not p: 4020c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi break 4030c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi op, av = p[0] 4040c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is LITERAL: 4050c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi cappend((op, av)) 4060c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 4070c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi break 4080c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 4090c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charset = c 4100c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is BRANCH: 4110c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi c = [] 4120c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi cappend = c.append 4130c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for p in av[1]: 4140c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if not p: 4150c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi break 4160c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi op, av = p[0] 4170c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if op is LITERAL: 4180c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi cappend((op, av)) 4190c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 4200c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi break 
4210c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 4220c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charset = c 4230c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif op is IN: 4240c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi charset = av 4250c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi## if prefix: 4260c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi## print "*** PREFIX", prefix, prefix_skip 4270c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi## if charset: 4280c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi## print "*** CHARSET", charset 4290c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # add an info block 4300c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit = code.append 4310c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(OPCODES[INFO]) 4320c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi skip = len(code); emit(0) 4330c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # literal flag 4340c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mask = 0 4350c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if prefix: 4360c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mask = SRE_INFO_PREFIX 4370c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if len(prefix) == prefix_skip == len(pattern.data): 4380c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mask = mask + SRE_INFO_LITERAL 4390c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif charset: 4400c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi mask = mask + SRE_INFO_CHARSET 4410c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(mask) 4420c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # pattern length 4430c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if lo < MAXCODE: 4440c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(lo) 4450c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 4460c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(MAXCODE) 4470c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi prefix = prefix[:MAXCODE] 4480c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if hi < MAXCODE: 4490c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(hi) 
4500c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi else: 4510c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(0) 4520c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # add literal prefix 4530c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi if prefix: 4540c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(len(prefix)) # length 4550c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi emit(prefix_skip) # skip 4560c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code.extend(prefix) 4570c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi # generate overlap table 4580c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi table = [-1] + ([0]*len(prefix)) 4590c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for i in xrange(len(prefix)): 4600c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi table[i+1] = table[i]+1 4610c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 4620c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi table[i+1] = table[table[i+1]-1]+1 4630c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code.extend(table[1:]) # don't store first entry 4640c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi elif charset: 4650c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi _compile_charset(charset, flags, code) 4660c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi code[skip] = len(code) - skip 4670c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 4680c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yitry: 4690c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi unicode 4700c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yiexcept NameError: 4710c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi STRING_TYPES = (type(""),) 4720c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yielse: 4730c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi STRING_TYPES = (type(""), type(unicode(""))) 4740c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi 4750c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef isstring(obj): 4760c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi for tp in STRING_TYPES: 
def isstring(obj):
    """Return a true value if obj is an instance of one of STRING_TYPES.

    The result is a bool; it compares equal to the 1/0 values callers
    may have relied on.
    """
    # isinstance() accepts a tuple of types, so no explicit loop over
    # STRING_TYPES is needed
    return isinstance(obj, STRING_TYPES)

def _code(p, flags):
    # internal: build the complete code list for a parsed pattern.
    # p is the parsed pattern (its p.pattern.flags and p.data are read
    # here); flags are the caller-supplied flags.  Returns a list made
    # of the info block, the compiled pattern body and a final SUCCESS
    # opcode.

    # merge the flags recorded on the parsed pattern with the caller's
    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    # terminate the code list
    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format
    #
    # p is either a pattern string, which is parsed with
    # sre_parse.parse(), or an already parsed pattern object.  The
    # return value is whatever _sre.compile() produces.  Raises
    # AssertionError if the parsed pattern reports more than 100 groups.

    if isstring(p):
        # keep the original text so it can be handed to _sre.compile()
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        # already parsed: no source text is available
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict        # name -> index
    indexgroup = [None] * p.pattern.groups  # index -> name (None if unnamed)
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        # NOTE(review): presumably p.pattern.groups also counts the
        # implicit group 0, hence the -1 -- confirm against sre_parse
        p.pattern.groups-1,
        groupindex, indexgroup
        )