sre_compile.py revision 29e383754e4a96b46d1bd9f72a49694cdb993850
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys

from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL
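
# Each code word is _sre.CODESIZE bytes wide (typically 2 on narrow Unicode
# builds, 4 on wide builds), so MAXCODE is the largest value that fits in a
# single code word.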

def _identityfunction(x):
    return x

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1}
    REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1}
    SUCCESS_CODES = {SUCCESS:1, FAILURE:1}
    ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1}
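    # Throughout the loop below, forward jumps are emitted as a zero
    # placeholder whose index is remembered in `skip` and back-patched with
    # the relative offset once the jump target's position is known.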
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
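            # A "simple" single-width body compiles to the compact form
            # REPEAT_ONE <skip> <min> <max> <body> SUCCESS (roughly, "a+"
            # becomes REPEAT_ONE 6 1 MAXREPEAT LITERAL 'a' SUCCESS);
            # otherwise the general REPEAT ... MAX_UNTIL/MIN_UNTIL form
            # below is used.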
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
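            # Capturing group n is bracketed by MARK (n-1)*2 before its
            # body and MARK (n-1)*2+1 after it; a non-capturing subpattern
            # (false av[0]) emits no marks.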
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
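            # Each alternative is laid out as <skip to next alternative>
            # <alternative code> JUMP <skip to end>; a trailing 0 marks the
            # end of the alternative list, and the JUMP targets are
            # back-patched once that end position is known.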
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit((av[0]-1)*2)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)

def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])
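
# For example, a parsed set such as [a-z] reaches _compile_charset as
# [(RANGE, (97, 122))] and is emitted (roughly) as RANGE 97 122 FAILURE.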

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    outappend = out.append
    charmap = [False]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = True
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = True
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset
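
# Example: the three literals of a set like [abc] collapse into a single
# (RANGE, (97, 99)) entry, while a set with more than two runs of adjacent
# characters is turned into a 256-bit CHARSET bitmap instead.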

def _mk_bitmap(bits):
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data
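
# _mk_bitmap packs the boolean list LSB-first into code words: with
# CODESIZE == 2 a 256-entry charmap becomes 16 16-bit words, with
# CODESIZE == 4 it becomes 8 32-bit words.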

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only Cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number mapping, and
# the less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
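
# Conceptually, a BIGCHARSET lookup for a character c works like this
# (pseudo-code; the actual test happens in C inside _sre):
#
#     chunk = mapping[c >> 8]   # high byte selects a chunk number
#     bit = c & 0xff            # low byte is a bit index in that chunk
#     matched = the chunk's bitmap has `bit` set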

def _optimize_unicode(charset, fixup):
    try:
        import array
    except ImportError:
        return charset
    charmap = [False]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = True
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = True
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if MAXCODE == 65535:
        code = 'H'
    else:
        code = 'L'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    header = header + array.array(code, mapping).tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
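
# For example, the body of "a*" is simple (fixed width 1, no group), so that
# repeat is compiled with REPEAT_ONE/MIN_REPEAT_ONE above; the body of
# "(a)*" is not, since the group's MARK handling needs the general REPEAT
# form.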

def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
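        # (a KMP-style table of proper prefix/suffix overlaps; e.g. for
        # the prefix "abab" the stored entries, after dropping the
        # leading -1, are [0, 0, 1, 2])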
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100,\
           "sorry, but this version only supports 100 named groups"

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
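
# Example usage (illustrative): the public re/sre layer calls into this
# module roughly as
#
#     import sre_compile
#     p = sre_compile.compile("a+b", 0)
#     assert p.match("aaab")
#
# where compile() accepts either a pattern string (parsed here with
# sre_parse) or an already-parsed SubPattern object.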
517