sre_compile.py revision 22d254652099e3a1f157543c7b1b37e3263e65c7
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
7#
8# Portions of this engine have been developed in cooperation with
9# CNRI.  Hewlett-Packard provided funding for 2.0 integration and
10# other compatibility work.
11#
12
13import array
14import _sre
15
16from sre_constants import *
17
18# find an array type code that matches the engine's code size
19for WORDSIZE in "BHil":
20    if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
21        break
22else:
23    raise RuntimeError, "cannot find a useable array type"
24
25def _compile(code, pattern, flags):
26    # internal: compile a (sub)pattern
27    emit = code.append
28    for op, av in pattern:
29        if op in (LITERAL, NOT_LITERAL):
30            if flags & SRE_FLAG_IGNORECASE:
31                emit(OPCODES[OP_IGNORE[op]])
32            else:
33                emit(OPCODES[op])
34            emit(av)
35        elif op is IN:
36            if flags & SRE_FLAG_IGNORECASE:
37                emit(OPCODES[OP_IGNORE[op]])
38                def fixup(literal, flags=flags):
39                    return _sre.getlower(literal, flags)
40            else:
41                emit(OPCODES[op])
42                fixup = lambda x: x
43            skip = len(code); emit(0)
44            for op, av in av:
45                emit(OPCODES[op])
46                if op is NEGATE:
47                    pass
48                elif op is LITERAL:
49                    emit(fixup(av))
50                elif op is RANGE:
51                    emit(fixup(av[0]))
52                    emit(fixup(av[1]))
53                elif op is CATEGORY:
54                    if flags & SRE_FLAG_LOCALE:
55                        emit(CHCODES[CH_LOCALE[av]])
56                    elif flags & SRE_FLAG_UNICODE:
57                        emit(CHCODES[CH_UNICODE[av]])
58                    else:
59                        emit(CHCODES[av])
60                else:
61                    raise error, "internal: unsupported set operator"
62            emit(OPCODES[FAILURE])
63            code[skip] = len(code) - skip
64        elif op is ANY:
65            if flags & SRE_FLAG_DOTALL:
66                emit(OPCODES[op])
67            else:
68                emit(OPCODES[CATEGORY])
69                emit(CHCODES[CATEGORY_NOT_LINEBREAK])
70        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
71            if flags & SRE_FLAG_TEMPLATE:
72                emit(OPCODES[REPEAT])
73                skip = len(code); emit(0)
74                emit(av[0])
75                emit(av[1])
76                _compile(code, av[2], flags)
77                emit(OPCODES[SUCCESS])
78                code[skip] = len(code) - skip
79            else:
80                lo, hi = av[2].getwidth()
81                if lo == 0:
82                    raise error, "nothing to repeat"
83                if 0 and lo == hi == 1 and op is MAX_REPEAT:
84                    # FIXME: <fl> need a better way to figure out when
85                    # it's safe to use this one (in the parser, probably)
86                    emit(OPCODES[MAX_REPEAT_ONE])
87                    skip = len(code); emit(0)
88                    emit(av[0])
89                    emit(av[1])
90                    _compile(code, av[2], flags)
91                    emit(OPCODES[SUCCESS])
92                    code[skip] = len(code) - skip
93                else:
94                    emit(OPCODES[op])
95                    skip = len(code); emit(0)
96                    emit(av[0])
97                    emit(av[1])
98                    _compile(code, av[2], flags)
99                    emit(OPCODES[SUCCESS])
100                    code[skip] = len(code) - skip
101        elif op is SUBPATTERN:
102            group = av[0]
103            if group:
104                emit(OPCODES[MARK])
105                emit((group-1)*2)
106            _compile(code, av[1], flags)
107            if group:
108                emit(OPCODES[MARK])
109                emit((group-1)*2+1)
110        elif op in (SUCCESS, FAILURE):
111            emit(OPCODES[op])
112        elif op in (ASSERT, ASSERT_NOT, CALL):
113            emit(OPCODES[op])
114            skip = len(code); emit(0)
115            _compile(code, av, flags)
116            emit(OPCODES[SUCCESS])
117            code[skip] = len(code) - skip
118        elif op is AT:
119            emit(OPCODES[op])
120            if flags & SRE_FLAG_MULTILINE:
121                emit(ATCODES[AT_MULTILINE.get(av, av)])
122            else:
123                emit(ATCODES[av])
124        elif op is BRANCH:
125            emit(OPCODES[op])
126            tail = []
127            for av in av[1]:
128                skip = len(code); emit(0)
129                _compile(code, av, flags)
130                emit(OPCODES[JUMP])
131                tail.append(len(code)); emit(0)
132                code[skip] = len(code) - skip
133            emit(0) # end of branch
134            for tail in tail:
135                code[tail] = len(code) - tail
136        elif op is CATEGORY:
137            emit(OPCODES[op])
138            if flags & SRE_FLAG_LOCALE:
139                emit(CHCODES[CH_LOCALE[av]])
140            elif flags & SRE_FLAG_UNICODE:
141                emit(CHCODES[CH_UNICODE[av]])
142            else:
143                emit(CHCODES[av])
144        elif op is GROUP:
145            if flags & SRE_FLAG_IGNORECASE:
146                emit(OPCODES[OP_IGNORE[op]])
147            else:
148                emit(OPCODES[op])
149            emit(av-1)
150        elif op is MARK:
151            emit(OPCODES[op])
152            emit(av)
153        else:
154            raise ValueError, ("unsupported operand type", op)
155
156def _compile_info(code, pattern, flags):
157    # internal: compile an info block.  in the current version,
158    # this contains min/max pattern width and a literal prefix,
159    # if any
160    lo, hi = pattern.getwidth()
161    if lo == 0:
162        return # not worth it
163    # look for a literal prefix
164    prefix = []
165    if not (flags & SRE_FLAG_IGNORECASE):
166        for op, av in pattern.data:
167            if op is LITERAL:
168                prefix.append(av)
169            else:
170                break
171    # add an info block
172    emit = code.append
173    emit(OPCODES[INFO])
174    skip = len(code); emit(0)
175    # literal flag
176    mask = 0
177    if len(prefix) == len(pattern.data):
178        mask = 1
179    emit(mask)
180    # pattern length
181    emit(lo)
182    if hi < 32768:
183        emit(hi)
184    else:
185        emit(0)
186    # add literal prefix
187    emit(len(prefix))
188    if prefix:
189        code.extend(prefix)
190        # generate overlap table
191        table = [-1] + ([0]*len(prefix))
192        for i in range(len(prefix)):
193            table[i+1] = table[i]+1
194            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
195                table[i+1] = table[table[i+1]-1]+1
196        code.extend(table[1:]) # don't store first entry
197    code[skip] = len(code) - skip
198
199def compile(p, flags=0):
200    # internal: convert pattern list to internal format
201
202    # compile, as necessary
203    if type(p) in (type(""), type(u"")):
204        import sre_parse
205        pattern = p
206        p = sre_parse.parse(p, flags)
207    else:
208        pattern = None
209
210    flags = p.pattern.flags | flags
211    code = []
212
213    # compile info block
214    _compile_info(code, p, flags)
215
216    # compile the pattern
217    _compile(code, p.data, flags)
218
219    code.append(OPCODES[SUCCESS])
220
221    # FIXME: <fl> get rid of this limitation!
222    assert p.pattern.groups <= 100,\
223           "sorry, but this version only supports 100 named groups"
224
225    return _sre.compile(
226        pattern, flags,
227        array.array(WORDSIZE, code).tostring(),
228        p.pattern.groups-1, p.pattern.groupdict
229        )
230