sre_compile.py revision 2f2c67d7e5934bdf96835f3c4774388b3e654314
1# 2# Secret Labs' Regular Expression Engine 3# 4# convert template to internal format 5# 6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11import _sre 12 13from sre_constants import * 14 15MAXCODE = 65535 16 17def _charset(charset, fixup=None): 18 # internal: optimize character set 19 if not fixup: 20 fixup = lambda x: x 21 out = [] 22 charmap = [0]*256 23 try: 24 for op, av in charset: 25 if op is NEGATE: 26 out.append((op, av)) 27 elif op is LITERAL: 28 charmap[fixup(av)] = 1 29 elif op is RANGE: 30 for i in range(fixup(av[0]), fixup(av[1])+1): 31 charmap[i] = 1 32 elif op is CATEGORY: 33 # FIXME: could append to charmap tail 34 return charset # cannot compress 35 except IndexError: 36 # unicode 37 return charset 38 # compress character map 39 i = p = n = 0 40 runs = [] 41 for c in charmap: 42 if c: 43 if n == 0: 44 p = i 45 n = n + 1 46 elif n: 47 runs.append((p, n)) 48 n = 0 49 i = i + 1 50 if n: 51 runs.append((p, n)) 52 if len(runs) <= 2: 53 # use literal/range 54 for p, n in runs: 55 if n == 1: 56 out.append((LITERAL, p)) 57 else: 58 out.append((RANGE, (p, p+n-1))) 59 if len(out) < len(charset): 60 return out 61 else: 62 # use bitmap 63 data = [] 64 m = 1; v = 0 65 for c in charmap: 66 if c: 67 v = v + m 68 m = m << 1 69 if m > MAXCODE: 70 data.append(v) 71 m = 1; v = 0 72 out.append((CHARSET, data)) 73 return out 74 return charset 75 76def _compile(code, pattern, flags): 77 # internal: compile a (sub)pattern 78 emit = code.append 79 for op, av in pattern: 80 if op in (LITERAL, NOT_LITERAL): 81 if flags & SRE_FLAG_IGNORECASE: 82 emit(OPCODES[OP_IGNORE[op]]) 83 else: 84 emit(OPCODES[op]) 85 emit(av) 86 elif op is IN: 87 if flags & SRE_FLAG_IGNORECASE: 88 emit(OPCODES[OP_IGNORE[op]]) 89 def fixup(literal, flags=flags): 90 return _sre.getlower(literal, flags) 91 else: 92 emit(OPCODES[op]) 93 fixup = lambda x: x 94 skip = len(code); emit(0) 95 for op, av in _charset(av, fixup): 96 emit(OPCODES[op]) 97 if op is NEGATE: 98 pass 99 elif op is LITERAL: 100 emit(fixup(av)) 101 elif op is RANGE: 102 emit(fixup(av[0])) 103 emit(fixup(av[1])) 104 elif op is CHARSET: 105 code.extend(av) 106 elif op is CATEGORY: 107 if flags & SRE_FLAG_LOCALE: 108 emit(CHCODES[CH_LOCALE[av]]) 109 elif flags & SRE_FLAG_UNICODE: 110 emit(CHCODES[CH_UNICODE[av]]) 111 else: 112 emit(CHCODES[av]) 113 else: 114 raise error, "internal: unsupported set operator" 115 emit(OPCODES[FAILURE]) 116 code[skip] = len(code) - skip 117 elif op is ANY: 118 if flags & SRE_FLAG_DOTALL: 119 emit(OPCODES[op]) 120 else: 121 emit(OPCODES[CATEGORY]) 122 emit(CHCODES[CATEGORY_NOT_LINEBREAK]) 123 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): 124 if flags & SRE_FLAG_TEMPLATE: 125 raise error, "internal: unsupported template operator" 126 emit(OPCODES[REPEAT]) 127 skip = len(code); emit(0) 128 emit(av[0]) 129 emit(av[1]) 130 _compile(code, av[2], flags) 131 emit(OPCODES[SUCCESS]) 132 code[skip] = len(code) - skip 133 else: 134 lo, hi = av[2].getwidth() 135 if lo == 0: 136 raise error, "nothing to repeat" 137 if 0 and lo == hi == 1 and op is MAX_REPEAT: 138 # FIXME: <fl> fast and wrong (but we'll fix that) 139 emit(OPCODES[REPEAT_ONE]) 140 skip = len(code); emit(0) 141 emit(av[0]) 142 emit(av[1]) 143 _compile(code, av[2], flags) 144 emit(OPCODES[SUCCESS]) 145 code[skip] = len(code) - skip 146 else: 147 emit(OPCODES[REPEAT]) 148 skip = len(code); emit(0) 149 emit(av[0]) 150 emit(av[1]) 151 _compile(code, av[2], flags) 152 code[skip] = len(code) - skip 153 if op == MAX_REPEAT: 154 emit(OPCODES[MAX_UNTIL]) 155 else: 156 emit(OPCODES[MIN_UNTIL]) 157 elif op is SUBPATTERN: 158 if av[0]: 159 emit(OPCODES[MARK]) 160 emit((av[0]-1)*2) 161 _compile(code, av[1], flags) 162 if av[0]: 163 emit(OPCODES[MARK]) 164 emit((av[0]-1)*2+1) 165 elif op in (SUCCESS, FAILURE): 166 emit(OPCODES[op]) 167 elif op in (ASSERT, ASSERT_NOT): 168 emit(OPCODES[op]) 169 skip = len(code); emit(0) 170 if av[0] >= 0: 171 emit(0) # look ahead 172 else: 173 lo, hi = av[1].getwidth() 174 if lo != hi: 175 raise error, "look-behind requires fixed-width pattern" 176 emit(lo) # look behind 177 _compile(code, av[1], flags) 178 emit(OPCODES[SUCCESS]) 179 code[skip] = len(code) - skip 180 elif op is CALL: 181 emit(OPCODES[op]) 182 skip = len(code); emit(0) 183 _compile(code, av, flags) 184 emit(OPCODES[SUCCESS]) 185 code[skip] = len(code) - skip 186 elif op is AT: 187 emit(OPCODES[op]) 188 if flags & SRE_FLAG_MULTILINE: 189 emit(ATCODES[AT_MULTILINE.get(av, av)]) 190 else: 191 emit(ATCODES[av]) 192 elif op is BRANCH: 193 emit(OPCODES[op]) 194 tail = [] 195 for av in av[1]: 196 skip = len(code); emit(0) 197 _compile(code, av, flags) 198 emit(OPCODES[JUMP]) 199 tail.append(len(code)); emit(0) 200 code[skip] = len(code) - skip 201 emit(0) # end of branch 202 for tail in tail: 203 code[tail] = len(code) - tail 204 elif op is CATEGORY: 205 emit(OPCODES[op]) 206 if flags & SRE_FLAG_LOCALE: 207 emit(CHCODES[CH_LOCALE[av]]) 208 elif flags & SRE_FLAG_UNICODE: 209 emit(CHCODES[CH_UNICODE[av]]) 210 else: 211 emit(CHCODES[av]) 212 elif op is GROUPREF: 213 if flags & SRE_FLAG_IGNORECASE: 214 emit(OPCODES[OP_IGNORE[op]]) 215 else: 216 emit(OPCODES[op]) 217 emit(av-1) 218 else: 219 raise ValueError, ("unsupported operand type", op) 220 221def _compile_info(code, pattern, flags): 222 # internal: compile an info block. in the current version, 223 # this contains min/max pattern width, and an optional literal 224 # prefix or a character map 225 lo, hi = pattern.getwidth() 226 if lo == 0: 227 return # not worth it 228 # look for a literal prefix 229 prefix = [] 230 charset = [] # not used 231 if not (flags & SRE_FLAG_IGNORECASE): 232 for op, av in pattern.data: 233 if op is LITERAL: 234 prefix.append(av) 235 else: 236 break 237 # add an info block 238 emit = code.append 239 emit(OPCODES[INFO]) 240 skip = len(code); emit(0) 241 # literal flag 242 mask = 0 243 if prefix: 244 mask = SRE_INFO_PREFIX 245 if len(prefix) == len(pattern.data): 246 mask = mask + SRE_INFO_LITERAL 247 elif charset: 248 mask = mask + SRE_INFO_CHARSET 249 emit(mask) 250 # pattern length 251 if lo < MAXCODE: 252 emit(lo) 253 else: 254 emit(MAXCODE) 255 prefix = prefix[:MAXCODE] 256 if hi < MAXCODE: 257 emit(hi) 258 else: 259 emit(0) 260 # add literal prefix 261 if prefix: 262 emit(len(prefix)) 263 if prefix: 264 code.extend(prefix) 265 # generate overlap table 266 table = [-1] + ([0]*len(prefix)) 267 for i in range(len(prefix)): 268 table[i+1] = table[i]+1 269 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 270 table[i+1] = table[table[i+1]-1]+1 271 code.extend(table[1:]) # don't store first entry 272 elif charset: 273 # FIXME: use charset optimizer! 274 for char in charset: 275 emit(OPCODES[LITERAL]) 276 emit(char) 277 emit(OPCODES[FAILURE]) 278 code[skip] = len(code) - skip 279 280STRING_TYPES = [type("")] 281 282try: 283 STRING_TYPES.append(type(unicode(""))) 284except NameError: 285 pass 286 287def _code(p, flags): 288 289 flags = p.pattern.flags | flags 290 code = [] 291 292 # compile info block 293 _compile_info(code, p, flags) 294 295 # compile the pattern 296 _compile(code, p.data, flags) 297 298 code.append(OPCODES[SUCCESS]) 299 300 return code 301 302def compile(p, flags=0): 303 # internal: convert pattern list to internal format 304 305 if type(p) in STRING_TYPES: 306 import sre_parse 307 pattern = p 308 p = sre_parse.parse(p, flags) 309 else: 310 pattern = None 311 312 code = _code(p, flags) 313 314 # print code 315 316 # FIXME: <fl> get rid of this limitation! 317 assert p.pattern.groups <= 100,\ 318 "sorry, but this version only supports 100 named groups" 319 320 # map in either direction 321 groupindex = p.pattern.groupdict 322 indexgroup = [None] * p.pattern.groups 323 for k, i in groupindex.items(): 324 indexgroup[i] = k 325 326 return _sre.compile( 327 pattern, flags, code, 328 p.pattern.groups-1, 329 groupindex, indexgroup 330 ) 331