1ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# 2ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Secret Labs' Regular Expression Engine 3ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# 4ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# convert template to internal format 5ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# 6ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# 8ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# See the sre.py file for information on usage and redistribution. 9ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# 10ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 11ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh"""Internal support module for sre""" 12ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 13ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport _sre, sys 14ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport sre_parse 15ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom sre_constants import * 16ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom _sre import MAXREPEAT 17ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 18ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehassert _sre.MAGIC == MAGIC, "SRE module mismatch" 19ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 20ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehif _sre.CODESIZE == 2: 21ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh MAXCODE = 65535 22ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehelse: 23ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh MAXCODE = 0xFFFFFFFFL 24ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 25ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _identityfunction(x): 26ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return x 27ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 28ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) 29ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) 30ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_SUCCESS_CODES = set([SUCCESS, FAILURE]) 31ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) 32ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 33ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _compile(code, pattern, flags): 34ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # internal: compile a (sub)pattern 35ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit = code.append 36ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _len = len 37ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh LITERAL_CODES = _LITERAL_CODES 38ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh REPEATING_CODES = _REPEATING_CODES 39ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh SUCCESS_CODES = _SUCCESS_CODES 40ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh ASSERT_CODES = _ASSERT_CODES 41ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for op, av in pattern: 42ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op in LITERAL_CODES: 43ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_IGNORECASE: 44ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[OP_IGNORE[op]]) 45ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(_sre.getlower(av, flags)) 46ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 47ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 48ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av) 49ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is IN: 50ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_IGNORECASE: 51ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[OP_IGNORE[op]]) 52ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh def fixup(literal, flags=flags): 53ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return _sre.getlower(literal, flags) 54ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 55ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 56ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh fixup = _identityfunction 57ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = _len(code); emit(0) 58ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile_charset(av, flags, code, fixup) 59ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = _len(code) - skip 60ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is ANY: 61ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_DOTALL: 62ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[ANY_ALL]) 63ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 64ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[ANY]) 65ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op in REPEATING_CODES: 66ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_TEMPLATE: 67ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh raise error, "internal: unsupported template operator" 68ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[REPEAT]) 69ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = _len(code); emit(0) 70ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av[0]) 71ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av[1]) 72ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av[2], flags) 73ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[SUCCESS]) 74ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = _len(code) - skip 75ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif _simple(av) and op is not REPEAT: 76ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is MAX_REPEAT: 77ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[REPEAT_ONE]) 78ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 79ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[MIN_REPEAT_ONE]) 80ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = _len(code); emit(0) 81ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av[0]) 82ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av[1]) 83ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av[2], flags) 84ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[SUCCESS]) 85ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = _len(code) - skip 86ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 87ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[REPEAT]) 88ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = _len(code); emit(0) 89ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av[0]) 90ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av[1]) 91ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av[2], flags) 92ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = _len(code) - skip 93ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is MAX_REPEAT: 94ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[MAX_UNTIL]) 95ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 96ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[MIN_UNTIL]) 97ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is SUBPATTERN: 98ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if av[0]: 99ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[MARK]) 100ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit((av[0]-1)*2) 101ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # _compile_info(code, av[1], flags) 102ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av[1], flags) 103ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if av[0]: 104ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[MARK]) 105ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit((av[0]-1)*2+1) 106ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op in SUCCESS_CODES: 107ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 108ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op in ASSERT_CODES: 109ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 110ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = _len(code); emit(0) 111ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if av[0] >= 0: 112ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(0) # look ahead 113ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 114ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh lo, hi = av[1].getwidth() 115ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if lo != hi: 116ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh raise error, "look-behind requires fixed-width pattern" 117ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(lo) # look behind 118ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av[1], flags) 119ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[SUCCESS]) 120ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = _len(code) - skip 121ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is CALL: 122ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 123ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = _len(code); emit(0) 124ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av, flags) 125ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[SUCCESS]) 126ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = _len(code) - skip 127ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is AT: 128ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 129ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_MULTILINE: 130ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh av = AT_MULTILINE.get(av, av) 131ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_LOCALE: 132ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh av = AT_LOCALE.get(av, av) 133ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif flags & SRE_FLAG_UNICODE: 134ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh av = AT_UNICODE.get(av, av) 135ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(ATCODES[av]) 136ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is BRANCH: 137ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 138ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh tail = [] 139ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh tailappend = tail.append 140ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for av in av[1]: 141ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = _len(code); emit(0) 142ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # _compile_info(code, av, flags) 143ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av, flags) 144ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[JUMP]) 145ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh tailappend(_len(code)); emit(0) 146ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = _len(code) - skip 147ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(0) # end of branch 148ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for tail in tail: 149ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[tail] = _len(code) - tail 150ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is CATEGORY: 151ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 152ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_LOCALE: 153ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh av = CH_LOCALE[av] 154ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif flags & SRE_FLAG_UNICODE: 155ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh av = CH_UNICODE[av] 156ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(CHCODES[av]) 157ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is GROUPREF: 158ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_IGNORECASE: 159ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[OP_IGNORE[op]]) 160ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 161ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 162ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av-1) 163ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is GROUPREF_EXISTS: 164ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 165ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(av[0]-1) 166ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skipyes = _len(code); emit(0) 167ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av[1], flags) 168ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if av[2]: 169ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[JUMP]) 170ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skipno = _len(code); emit(0) 171ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skipyes] = _len(code) - skipyes + 1 172ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, av[2], flags) 173ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skipno] = _len(code) - skipno 174ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 175ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skipyes] = _len(code) - skipyes + 1 176ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 177ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh raise ValueError, ("unsupported operand type", op) 178ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 179ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _compile_charset(charset, flags, code, fixup=None): 180ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # compile charset subprogram 181ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit = code.append 182ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if fixup is None: 183ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh fixup = _identityfunction 184ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for op, av in _optimize_charset(charset, fixup): 185ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[op]) 186ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is NEGATE: 187ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pass 188ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is LITERAL: 189ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(fixup(av)) 190ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is RANGE: 191ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(fixup(av[0])) 192ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(fixup(av[1])) 193ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is CHARSET: 194ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code.extend(av) 195ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is BIGCHARSET: 196ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code.extend(av) 197ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is CATEGORY: 198ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if flags & SRE_FLAG_LOCALE: 199ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(CHCODES[CH_LOCALE[av]]) 200ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif flags & SRE_FLAG_UNICODE: 201ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(CHCODES[CH_UNICODE[av]]) 202ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 203ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(CHCODES[av]) 204ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 205ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh raise error, "internal: unsupported set operator" 206ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[FAILURE]) 207ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 208ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _optimize_charset(charset, fixup): 209ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # internal: optimize character set 210ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh out = [] 211ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh outappend = out.append 212ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charmap = [0]*256 213ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh try: 214ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for op, av in charset: 215ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is NEGATE: 216ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh outappend((op, av)) 217ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is LITERAL: 218ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charmap[fixup(av)] = 1 219ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is RANGE: 220ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for i in range(fixup(av[0]), fixup(av[1])+1): 221ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charmap[i] = 1 222ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is CATEGORY: 223ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # XXX: could append to charmap tail 224ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return charset # cannot compress 225ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh except IndexError: 226ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # character set contains unicode characters 227ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return _optimize_unicode(charset, fixup) 228ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # compress character map 229ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh i = p = n = 0 230ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh runs = [] 231ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh runsappend = runs.append 232ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for c in charmap: 233ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if c: 234ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if n == 0: 235ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh p = i 236ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh n = n + 1 237ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif n: 238ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh runsappend((p, n)) 239ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh n = 0 240ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh i = i + 1 241ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if n: 242ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh runsappend((p, n)) 243ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if len(runs) <= 2: 244ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # use literal/range 245ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for p, n in runs: 246ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if n == 1: 247ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh outappend((LITERAL, p)) 248ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 249ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh outappend((RANGE, (p, p+n-1))) 250ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if len(out) < len(charset): 251ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return out 252ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 253ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # use bitmap 254ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data = _mk_bitmap(charmap) 255ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh outappend((CHARSET, data)) 256ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return out 257ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return charset 258ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 259ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _mk_bitmap(bits): 260ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data = [] 261ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh dataappend = data.append 262ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if _sre.CODESIZE == 2: 263ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh start = (1, 0) 264ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 265ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh start = (1L, 0L) 266ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh m, v = start 267ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for c in bits: 268ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if c: 269ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh v = v + m 270ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh m = m + m 271ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if m > MAXCODE: 272ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh dataappend(v) 273ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh m, v = start 274ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return data 275ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 276ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# To represent a big charset, first a bitmap of all characters in the 277ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# set is constructed. Then, this bitmap is sliced into chunks of 256 278ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# characters, duplicate chunks are eliminated, and each chunk is 279ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# given a number. In the compiled expression, the charset is 280ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# represented by a 16-bit word sequence, consisting of one word for 281ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# the number of different chunks, a sequence of 256 bytes (128 words) 282ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# of chunk numbers indexed by their original chunk position, and a 283ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# sequence of chunks (16 words each). 284ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 285ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Compression is normally good: in a typical charset, large ranges of 286ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Unicode will be either completely excluded (e.g. if only cyrillic 287ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# letters are to be matched), or completely included (e.g. if large 288ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# subranges of Kanji match). These ranges will be represented by 289ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# chunks of all one-bits or all zero-bits. 290ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 291ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Matching can be also done efficiently: the more significant byte of 292ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# the Unicode character is an index into the chunk number, and the 293ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# less significant byte is a bit index in the chunk (just like the 294ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# CHARSET matching). 295ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 296ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 297ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# of the basic multilingual plane; an efficient representation 298ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# for all of UTF-16 has not yet been developed. This means, 299ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# in particular, that negated charsets cannot be represented as 300ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# bigcharsets. 301ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 302ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _optimize_unicode(charset, fixup): 303ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh try: 304ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh import array 305ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh except ImportError: 306ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return charset 307ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charmap = [0]*65536 308ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh negate = 0 309ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh try: 310ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for op, av in charset: 311ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is NEGATE: 312ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh negate = 1 313ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is LITERAL: 314ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charmap[fixup(av)] = 1 315ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is RANGE: 316ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for i in xrange(fixup(av[0]), fixup(av[1])+1): 317ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charmap[i] = 1 318ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is CATEGORY: 319ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # XXX: could expand category 320ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return charset # cannot compress 321ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh except IndexError: 322ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # non-BMP characters 323ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return charset 324ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if negate: 325ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if sys.maxunicode != 65535: 326ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # XXX: negation does not work with big charsets 327ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return charset 328ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for i in xrange(65536): 329ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charmap[i] = not charmap[i] 330ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh comps = {} 331ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mapping = [0]*256 332ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh block = 0 333ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data = [] 334ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for i in xrange(256): 335ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh chunk = tuple(charmap[i*256:(i+1)*256]) 336ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh new = comps.setdefault(chunk, block) 337ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mapping[i] = new 338ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if new == block: 339ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh block = block + 1 340ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data = data + _mk_bitmap(chunk) 341ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh header = [block] 342ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if _sre.CODESIZE == 2: 343ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code = 'H' 344ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 345ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code = 'I' 346ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # Convert block indices to byte array of 256 bytes 347ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mapping = array.array('b', mapping).tostring() 348ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # Convert byte array to word array 349ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mapping = array.array(code, mapping) 350ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh assert mapping.itemsize == _sre.CODESIZE 351ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh header = header + mapping.tolist() 352ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh data[0:0] = header 353ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return [(BIGCHARSET, data)] 354ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 355ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _simple(av): 356ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # check if av is a "simple" operator 357ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh lo, hi = av[2].getwidth() 358ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if lo == 0 and hi == MAXREPEAT: 359ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh raise error, "nothing to repeat" 360ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return lo == hi == 1 and av[2][0][0] != SUBPATTERN 361ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 362ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _compile_info(code, pattern, flags): 363ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # internal: compile an info block. in the current version, 364ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # this contains min/max pattern width, and an optional literal 365ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # prefix or a character map 366ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh lo, hi = pattern.getwidth() 367ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if lo == 0: 368ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return # not worth it 369ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # look for a literal prefix 370ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh prefix = [] 371ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh prefixappend = prefix.append 372ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh prefix_skip = 0 373ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charset = [] # not used 374ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charsetappend = charset.append 375ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if not (flags & SRE_FLAG_IGNORECASE): 376ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # look for literal prefix 377ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for op, av in pattern.data: 378ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is LITERAL: 379ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if len(prefix) == prefix_skip: 380ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh prefix_skip = prefix_skip + 1 381ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh prefixappend(av) 382ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is SUBPATTERN and len(av[1]) == 1: 383ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh op, av = av[1][0] 384ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is LITERAL: 385ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh prefixappend(av) 386ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 387ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh break 388ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 389ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh break 390ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # if no prefix, look for charset prefix 391ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if not prefix and pattern.data: 392ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh op, av = pattern.data[0] 393ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is SUBPATTERN and av[1]: 394ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh op, av = av[1][0] 395ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is LITERAL: 396ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charsetappend((op, av)) 397ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is BRANCH: 398ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh c = [] 399ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh cappend = c.append 400ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for p in av[1]: 401ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if not p: 402ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh break 403ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh op, av = p[0] 404ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is LITERAL: 405ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh cappend((op, av)) 406ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 407ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh break 408ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 409ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charset = c 410ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is BRANCH: 411ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh c = [] 412ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh cappend = c.append 413ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for p in av[1]: 414ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if not p: 415ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh break 416ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh op, av = p[0] 417ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if op is LITERAL: 418ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh cappend((op, av)) 419ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 420ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh break 421ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 422ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charset = c 423ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif op is IN: 424ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh charset = av 425ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh## if prefix: 426ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh## print "*** PREFIX", prefix, prefix_skip 427ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh## if charset: 428ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh## print "*** CHARSET", charset 429ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # add an info block 430ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit = code.append 431ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(OPCODES[INFO]) 432ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh skip = len(code); emit(0) 433ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # literal flag 434ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mask = 0 435ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if prefix: 436ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mask = SRE_INFO_PREFIX 437ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if len(prefix) == prefix_skip == len(pattern.data): 438ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mask = mask + SRE_INFO_LITERAL 439ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif charset: 440ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh mask = mask + SRE_INFO_CHARSET 441ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(mask) 442ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # pattern length 443ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if lo < MAXCODE: 444ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(lo) 445ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 446ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(MAXCODE) 447ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh prefix = prefix[:MAXCODE] 448ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if hi < MAXCODE: 449ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(hi) 450ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 451ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(0) 452ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # add literal prefix 453ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if prefix: 454ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(len(prefix)) # length 455ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh emit(prefix_skip) # skip 456ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code.extend(prefix) 457ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # generate overlap table 458ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh table = [-1] + ([0]*len(prefix)) 459ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for i in xrange(len(prefix)): 460ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh table[i+1] = table[i]+1 461ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 462ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh table[i+1] = table[table[i+1]-1]+1 463ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code.extend(table[1:]) # don't store first entry 464ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh elif charset: 465ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile_charset(charset, flags, code) 466ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code[skip] = len(code) - skip 467ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 468ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehtry: 469ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh unicode 470ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehexcept NameError: 471ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh STRING_TYPES = (type(""),) 472ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehelse: 473ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh STRING_TYPES = (type(""), type(unicode(""))) 474ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 475ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef isstring(obj): 476ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for tp in STRING_TYPES: 477ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if isinstance(obj, tp): 478ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return 1 479ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return 0 480ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 481ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _code(p, flags): 482ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 483ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh flags = p.pattern.flags | flags 484ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code = [] 485ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 486ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # compile info block 487ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile_info(code, p, flags) 488ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 489ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # compile the pattern 490ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh _compile(code, p.data, flags) 491ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 492ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code.append(OPCODES[SUCCESS]) 493ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 494ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return code 495ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 496ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef compile(p, flags=0): 497ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # internal: convert pattern list to internal format 498ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 499ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if isstring(p): 500ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pattern = p 501ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh p = sre_parse.parse(p, flags) 502ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh else: 503ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pattern = None 504ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 505ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh code = _code(p, flags) 506ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 507ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # print code 508ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 509ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # XXX: <fl> get rid of this limitation! 510ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh if p.pattern.groups > 100: 511ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh raise AssertionError( 512ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh "sorry, but this version only supports 100 named groups" 513ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh ) 514ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 515ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh # map in either direction 516ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh groupindex = p.pattern.groupdict 517ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh indexgroup = [None] * p.pattern.groups 518ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh for k, i in groupindex.items(): 519ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh indexgroup[i] = k 520ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh 521ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh return _sre.compile( 522ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh pattern, flags | p.pattern.flags, code, 523ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh p.pattern.groups-1, 524ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh groupindex, indexgroup 525ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh ) 526