sre_compile.py revision 4ccea94152599d7a80c01d5ebddb70f5abf3ce21
1# 2# Secret Labs' Regular Expression Engine 3# 4# convert template to internal format 5# 6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. 7# 8# Portions of this engine have been developed in cooperation with 9# CNRI. Hewlett-Packard provided funding for 1.6 integration and 10# other compatibility work. 11# 12 13import array 14import _sre 15 16from sre_constants import * 17 18# find an array type code that matches the engine's code size 19for WORDSIZE in "BHil": 20 if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): 21 break 22else: 23 raise RuntimeError, "cannot find a useable array type" 24 25def _compile(code, pattern, flags): 26 # internal: compile a (sub)pattern 27 emit = code.append 28 for op, av in pattern: 29 if op in (LITERAL, NOT_LITERAL): 30 if flags & SRE_FLAG_IGNORECASE: 31 emit(OPCODES[OP_IGNORE[op]]) 32 else: 33 emit(OPCODES[op]) 34 emit(av) 35 elif op is IN: 36 if flags & SRE_FLAG_IGNORECASE: 37 emit(OPCODES[OP_IGNORE[op]]) 38 def fixup(literal, flags=flags): 39 return _sre.getlower(literal, flags) 40 else: 41 emit(OPCODES[op]) 42 fixup = lambda x: x 43 skip = len(code); emit(0) 44 for op, av in av: 45 emit(OPCODES[op]) 46 if op is NEGATE: 47 pass 48 elif op is LITERAL: 49 emit(fixup(av)) 50 elif op is RANGE: 51 emit(fixup(av[0])) 52 emit(fixup(av[1])) 53 elif op is CATEGORY: 54 if flags & SRE_FLAG_LOCALE: 55 emit(CHCODES[CH_LOCALE[av]]) 56 elif flags & SRE_FLAG_UNICODE: 57 emit(CHCODES[CH_UNICODE[av]]) 58 else: 59 emit(CHCODES[av]) 60 else: 61 raise error, "internal: unsupported set operator" 62 emit(OPCODES[FAILURE]) 63 code[skip] = len(code) - skip 64 elif op is ANY: 65 if flags & SRE_FLAG_DOTALL: 66 emit(OPCODES[op]) 67 else: 68 emit(OPCODES[CATEGORY]) 69 emit(CHCODES[CATEGORY_NOT_LINEBREAK]) 70 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): 71 if flags & SRE_FLAG_TEMPLATE: 72 emit(OPCODES[REPEAT]) 73 skip = len(code); emit(0) 74 emit(av[0]) 75 emit(av[1]) 76 _compile(code, av[2], flags) 77 emit(OPCODES[SUCCESS]) 78 code[skip] = len(code) - skip 79 else: 80 lo, hi = av[2].getwidth() 81 if lo == 0: 82 raise error, "nothing to repeat" 83 if 0 and lo == hi == 1 and op is MAX_REPEAT: 84 # FIXME: <fl> need a better way to figure out when 85 # it's safe to use this one (in the parser, probably) 86 emit(OPCODES[MAX_REPEAT_ONE]) 87 skip = len(code); emit(0) 88 emit(av[0]) 89 emit(av[1]) 90 _compile(code, av[2], flags) 91 emit(OPCODES[SUCCESS]) 92 code[skip] = len(code) - skip 93 else: 94 emit(OPCODES[op]) 95 skip = len(code); emit(0) 96 emit(av[0]) 97 emit(av[1]) 98 _compile(code, av[2], flags) 99 emit(OPCODES[SUCCESS]) 100 code[skip] = len(code) - skip 101 elif op is SUBPATTERN: 102 group = av[0] 103 if group: 104 emit(OPCODES[MARK]) 105 emit((group-1)*2) 106 _compile(code, av[1], flags) 107 if group: 108 emit(OPCODES[MARK]) 109 emit((group-1)*2+1) 110 elif op in (SUCCESS, FAILURE): 111 emit(OPCODES[op]) 112 elif op in (ASSERT, ASSERT_NOT, CALL): 113 emit(OPCODES[op]) 114 skip = len(code); emit(0) 115 _compile(code, av, flags) 116 emit(OPCODES[SUCCESS]) 117 code[skip] = len(code) - skip 118 elif op is AT: 119 emit(OPCODES[op]) 120 if flags & SRE_FLAG_MULTILINE: 121 emit(ATCODES[AT_MULTILINE[av]]) 122 else: 123 emit(ATCODES[av]) 124 elif op is BRANCH: 125 emit(OPCODES[op]) 126 tail = [] 127 for av in av[1]: 128 skip = len(code); emit(0) 129 _compile(code, av, flags) 130 emit(OPCODES[JUMP]) 131 tail.append(len(code)); emit(0) 132 code[skip] = len(code) - skip 133 emit(0) # end of branch 134 for tail in tail: 135 code[tail] = len(code) - tail 136 elif op is CATEGORY: 137 emit(OPCODES[op]) 138 if flags & SRE_FLAG_LOCALE: 139 emit(CHCODES[CH_LOCALE[av]]) 140 elif flags & SRE_FLAG_UNICODE: 141 emit(CHCODES[CH_UNICODE[av]]) 142 else: 143 emit(CHCODES[av]) 144 elif op is GROUP: 145 if flags & SRE_FLAG_IGNORECASE: 146 emit(OPCODES[OP_IGNORE[op]]) 147 else: 148 emit(OPCODES[op]) 149 emit(av-1) 150 elif op is MARK: 151 emit(OPCODES[op]) 152 emit(av) 153 else: 154 raise ValueError, ("unsupported operand type", op) 155 156def _compile_info(code, pattern, flags): 157 # internal: compile an info block. in the current version, 158 # this contains min/max pattern width and a literal prefix, 159 # if any 160 lo, hi = pattern.getwidth() 161 if lo == 0: 162 return # not worth it 163 # look for a literal prefix 164 prefix = [] 165 if not (flags & SRE_FLAG_IGNORECASE): 166 for op, av in pattern.data: 167 if op is LITERAL: 168 prefix.append(av) 169 else: 170 break 171 # add an info block 172 emit = code.append 173 emit(OPCODES[INFO]) 174 skip = len(code); emit(0) 175 # literal flag 176 mask = 0 177 if len(prefix) == len(pattern.data): 178 mask = 1 179 emit(mask) 180 # pattern length 181 emit(lo) 182 if hi < 32768: 183 emit(hi) 184 else: 185 emit(0) 186 # add literal prefix 187 emit(len(prefix)) 188 if prefix: 189 code.extend(prefix) 190 # generate overlap table 191 table = [-1] + ([0]*len(prefix)) 192 for i in range(len(prefix)): 193 table[i+1] = table[i]+1 194 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 195 table[i+1] = table[table[i+1]-1]+1 196 code.extend(table[1:]) # don't store first entry 197 code[skip] = len(code) - skip 198 199def compile(p, flags=0): 200 # internal: convert pattern list to internal format 201 202 # compile, as necessary 203 if type(p) in (type(""), type(u"")): 204 import sre_parse 205 pattern = p 206 p = sre_parse.parse(p) 207 else: 208 pattern = None 209 210 flags = p.pattern.flags | flags 211 code = [] 212 213 # compile info block 214 _compile_info(code, p, flags) 215 216 # compile the pattern 217 _compile(code, p.data, flags) 218 219 code.append(OPCODES[SUCCESS]) 220 221 # FIXME: <fl> get rid of this limitation! 222 assert p.pattern.groups <= 100,\ 223 "sorry, but this version only supports 100 named groups" 224 225 return _sre.compile( 226 pattern, flags, 227 array.array(WORDSIZE, code).tostring(), 228 p.pattern.groups-1, p.pattern.groupdict 229 ) 230