sre_compile.py revision 2f2c67d7e5934bdf96835f3c4774388b3e654314
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11import _sre
12
13from sre_constants import *
14
15MAXCODE = 65535
16
17def _charset(charset, fixup=None):
18    # internal: optimize character set
19    if not fixup:
20        fixup = lambda x: x
21    out = []
22    charmap = [0]*256
23    try:
24        for op, av in charset:
25            if op is NEGATE:
26                out.append((op, av))
27            elif op is LITERAL:
28                charmap[fixup(av)] = 1
29            elif op is RANGE:
30                for i in range(fixup(av[0]), fixup(av[1])+1):
31                    charmap[i] = 1
32            elif op is CATEGORY:
33                # FIXME: could append to charmap tail
34                return charset # cannot compress
35    except IndexError:
36        # unicode
37        return charset
38    # compress character map
39    i = p = n = 0
40    runs = []
41    for c in charmap:
42        if c:
43            if n == 0:
44                p = i
45            n = n + 1
46        elif n:
47            runs.append((p, n))
48            n = 0
49        i = i + 1
50    if n:
51        runs.append((p, n))
52    if len(runs) <= 2:
53        # use literal/range
54        for p, n in runs:
55            if n == 1:
56                out.append((LITERAL, p))
57            else:
58                out.append((RANGE, (p, p+n-1)))
59        if len(out) < len(charset):
60            return out
61    else:
62        # use bitmap
63        data = []
64        m = 1; v = 0
65        for c in charmap:
66            if c:
67                v = v + m
68            m = m << 1
69            if m > MAXCODE:
70                data.append(v)
71                m = 1; v = 0
72        out.append((CHARSET, data))
73        return out
74    return charset
75
76def _compile(code, pattern, flags):
77    # internal: compile a (sub)pattern
78    emit = code.append
79    for op, av in pattern:
80        if op in (LITERAL, NOT_LITERAL):
81            if flags & SRE_FLAG_IGNORECASE:
82                emit(OPCODES[OP_IGNORE[op]])
83            else:
84                emit(OPCODES[op])
85            emit(av)
86        elif op is IN:
87            if flags & SRE_FLAG_IGNORECASE:
88                emit(OPCODES[OP_IGNORE[op]])
89                def fixup(literal, flags=flags):
90                    return _sre.getlower(literal, flags)
91            else:
92                emit(OPCODES[op])
93                fixup = lambda x: x
94            skip = len(code); emit(0)
95            for op, av in _charset(av, fixup):
96                emit(OPCODES[op])
97                if op is NEGATE:
98                    pass
99                elif op is LITERAL:
100                    emit(fixup(av))
101                elif op is RANGE:
102                    emit(fixup(av[0]))
103                    emit(fixup(av[1]))
104                elif op is CHARSET:
105                    code.extend(av)
106                elif op is CATEGORY:
107                    if flags & SRE_FLAG_LOCALE:
108                        emit(CHCODES[CH_LOCALE[av]])
109                    elif flags & SRE_FLAG_UNICODE:
110                        emit(CHCODES[CH_UNICODE[av]])
111                    else:
112                        emit(CHCODES[av])
113                else:
114                    raise error, "internal: unsupported set operator"
115            emit(OPCODES[FAILURE])
116            code[skip] = len(code) - skip
117        elif op is ANY:
118            if flags & SRE_FLAG_DOTALL:
119                emit(OPCODES[op])
120            else:
121                emit(OPCODES[CATEGORY])
122                emit(CHCODES[CATEGORY_NOT_LINEBREAK])
123        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
124            if flags & SRE_FLAG_TEMPLATE:
125                raise error, "internal: unsupported template operator"
126                emit(OPCODES[REPEAT])
127                skip = len(code); emit(0)
128                emit(av[0])
129                emit(av[1])
130                _compile(code, av[2], flags)
131                emit(OPCODES[SUCCESS])
132                code[skip] = len(code) - skip
133            else:
134                lo, hi = av[2].getwidth()
135                if lo == 0:
136                    raise error, "nothing to repeat"
137                if 0 and lo == hi == 1 and op is MAX_REPEAT:
138                    # FIXME: <fl> fast and wrong (but we'll fix that)
139                    emit(OPCODES[REPEAT_ONE])
140                    skip = len(code); emit(0)
141                    emit(av[0])
142                    emit(av[1])
143                    _compile(code, av[2], flags)
144                    emit(OPCODES[SUCCESS])
145                    code[skip] = len(code) - skip
146                else:
147                    emit(OPCODES[REPEAT])
148                    skip = len(code); emit(0)
149                    emit(av[0])
150                    emit(av[1])
151                    _compile(code, av[2], flags)
152                    code[skip] = len(code) - skip
153                    if op == MAX_REPEAT:
154                        emit(OPCODES[MAX_UNTIL])
155                    else:
156                        emit(OPCODES[MIN_UNTIL])
157        elif op is SUBPATTERN:
158            if av[0]:
159                emit(OPCODES[MARK])
160                emit((av[0]-1)*2)
161            _compile(code, av[1], flags)
162            if av[0]:
163                emit(OPCODES[MARK])
164                emit((av[0]-1)*2+1)
165        elif op in (SUCCESS, FAILURE):
166            emit(OPCODES[op])
167        elif op in (ASSERT, ASSERT_NOT):
168            emit(OPCODES[op])
169            skip = len(code); emit(0)
170            if av[0] >= 0:
171                emit(0) # look ahead
172            else:
173                lo, hi = av[1].getwidth()
174                if lo != hi:
175                    raise error, "look-behind requires fixed-width pattern"
176                emit(lo) # look behind
177            _compile(code, av[1], flags)
178            emit(OPCODES[SUCCESS])
179            code[skip] = len(code) - skip
180        elif op is CALL:
181            emit(OPCODES[op])
182            skip = len(code); emit(0)
183            _compile(code, av, flags)
184            emit(OPCODES[SUCCESS])
185            code[skip] = len(code) - skip
186        elif op is AT:
187            emit(OPCODES[op])
188            if flags & SRE_FLAG_MULTILINE:
189                emit(ATCODES[AT_MULTILINE.get(av, av)])
190            else:
191                emit(ATCODES[av])
192        elif op is BRANCH:
193            emit(OPCODES[op])
194            tail = []
195            for av in av[1]:
196                skip = len(code); emit(0)
197                _compile(code, av, flags)
198                emit(OPCODES[JUMP])
199                tail.append(len(code)); emit(0)
200                code[skip] = len(code) - skip
201            emit(0) # end of branch
202            for tail in tail:
203                code[tail] = len(code) - tail
204        elif op is CATEGORY:
205            emit(OPCODES[op])
206            if flags & SRE_FLAG_LOCALE:
207                emit(CHCODES[CH_LOCALE[av]])
208            elif flags & SRE_FLAG_UNICODE:
209                emit(CHCODES[CH_UNICODE[av]])
210            else:
211                emit(CHCODES[av])
212        elif op is GROUPREF:
213            if flags & SRE_FLAG_IGNORECASE:
214                emit(OPCODES[OP_IGNORE[op]])
215            else:
216                emit(OPCODES[op])
217            emit(av-1)
218        else:
219            raise ValueError, ("unsupported operand type", op)
220
221def _compile_info(code, pattern, flags):
222    # internal: compile an info block.  in the current version,
223    # this contains min/max pattern width, and an optional literal
224    # prefix or a character map
225    lo, hi = pattern.getwidth()
226    if lo == 0:
227        return # not worth it
228    # look for a literal prefix
229    prefix = []
230    charset = [] # not used
231    if not (flags & SRE_FLAG_IGNORECASE):
232        for op, av in pattern.data:
233            if op is LITERAL:
234                prefix.append(av)
235            else:
236                break
237    # add an info block
238    emit = code.append
239    emit(OPCODES[INFO])
240    skip = len(code); emit(0)
241    # literal flag
242    mask = 0
243    if prefix:
244        mask = SRE_INFO_PREFIX
245        if len(prefix) == len(pattern.data):
246            mask = mask + SRE_INFO_LITERAL
247    elif charset:
248        mask = mask + SRE_INFO_CHARSET
249    emit(mask)
250    # pattern length
251    if lo < MAXCODE:
252        emit(lo)
253    else:
254        emit(MAXCODE)
255        prefix = prefix[:MAXCODE]
256    if hi < MAXCODE:
257        emit(hi)
258    else:
259        emit(0)
260    # add literal prefix
261    if prefix:
262        emit(len(prefix))
263        if prefix:
264            code.extend(prefix)
265            # generate overlap table
266            table = [-1] + ([0]*len(prefix))
267            for i in range(len(prefix)):
268                table[i+1] = table[i]+1
269                while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
270                    table[i+1] = table[table[i+1]-1]+1
271            code.extend(table[1:]) # don't store first entry
272    elif charset:
273        # FIXME: use charset optimizer!
274        for char in charset:
275            emit(OPCODES[LITERAL])
276            emit(char)
277        emit(OPCODES[FAILURE])
278    code[skip] = len(code) - skip
279
280STRING_TYPES = [type("")]
281
282try:
283    STRING_TYPES.append(type(unicode("")))
284except NameError:
285    pass
286
287def _code(p, flags):
288
289    flags = p.pattern.flags | flags
290    code = []
291
292    # compile info block
293    _compile_info(code, p, flags)
294
295    # compile the pattern
296    _compile(code, p.data, flags)
297
298    code.append(OPCODES[SUCCESS])
299
300    return code
301
302def compile(p, flags=0):
303    # internal: convert pattern list to internal format
304
305    if type(p) in STRING_TYPES:
306        import sre_parse
307        pattern = p
308        p = sre_parse.parse(p, flags)
309    else:
310        pattern = None
311
312    code = _code(p, flags)
313
314    # print code
315
316    # FIXME: <fl> get rid of this limitation!
317    assert p.pattern.groups <= 100,\
318           "sorry, but this version only supports 100 named groups"
319
320    # map in either direction
321    groupindex = p.pattern.groupdict
322    indexgroup = [None] * p.pattern.groups
323    for k, i in groupindex.items():
324        indexgroup[i] = k
325
326    return _sre.compile(
327        pattern, flags, code,
328        p.pattern.groups-1,
329        groupindex, indexgroup
330        )
331