# sre_compile.py revision 5e376a7809d7efe014f3f9b967a4554d79e10328
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13import _sre, sys
14import sre_parse
15from sre_constants import *
16from _sre import MAXREPEAT
17
18assert _sre.MAGIC == MAGIC, "SRE module mismatch"
19
20if _sre.CODESIZE == 2:
21    MAXCODE = 65535
22else:
23    MAXCODE = 0xFFFFFFFF
24
25def _identityfunction(x):
26    return x
27
28_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
29_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
30_SUCCESS_CODES = set([SUCCESS, FAILURE])
31_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
32
def _compile(code, pattern, flags):
    """Compile a (sub)pattern, appending code words to *code*.

    code    -- output list of code words (mutated in place)
    pattern -- sequence of (op, av) tuples produced by sre_parse
    flags   -- SRE_FLAG_* bitmask

    Raises error for unsupported constructs and ValueError for an
    unknown opcode.  Fixes vs. the previous revision: the unreachable
    emit sequence that followed the "unsupported template operator"
    raise has been deleted, and the BRANCH fixup loop no longer
    shadows its own list variable.
    """
    emit = code.append
    _len = len
    # bind module-level lookup tables to locals (hot recursive path)
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            # reserve a skip slot, backpatched after the set is emitted
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                # templates cannot contain repeats (the emit sequence
                # formerly after this raise was dead code)
                raise error("internal: unsupported template operator")
            elif _simple(av) and op is not REPEAT:
                # single-width target: use the cheaper *_ONE opcodes
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av == (group number or None/0, subpattern)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            # map position codes per flags (multiline/locale/unicode)
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            # backpatch every alternative's JUMP target (renamed loop
            # variable: the old code iterated "for tail in tail")
            for pos in tail:
                code[pos] = _len(code) - pos
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError("unsupported operand type", op)
178
def _compile_charset(charset, flags, code, fixup=None):
    """Compile a charset subprogram onto *code*, ending with FAILURE.

    The set is first run through _optimize_charset; each resulting
    (op, av) item is then emitted with *fixup* applied to literal
    character codes.
    """
    if fixup is None:
        fixup = _identityfunction
    emit = code.append
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            continue  # no argument
        if op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            lo, hi = av
            emit(fixup(lo))
            emit(fixup(hi))
        elif op is CHARSET or op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    emit(OPCODES[FAILURE])
207
def _optimize_charset(charset, fixup):
    # internal: optimize character set
    # Tries to collapse the (op, av) set items into either one or two
    # literal/range items or a single 256-bit CHARSET bitmap; returns
    # the original list when no compressed form is smaller, and defers
    # to _optimize_unicode for characters beyond the 8-bit range.
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    # scan the 256-slot bitmap for runs of consecutive set bits;
    # each run is recorded as (start, length)
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        # only worthwhile if shorter than the original item list
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset
258
def _mk_bitmap(bits):
    """Pack a sequence of 0/1 flags into a list of code words.

    Bits are accumulated least-significant-first into *v*; a word is
    flushed whenever the moving marker bit *m* exceeds MAXCODE.  The
    previous revision selected the initial (m, v) pair with an
    if/else on _sre.CODESIZE whose two branches were identical; that
    dead conditional has been removed.
    """
    data = []
    dataappend = data.append
    m, v = 1, 0
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            # word full: flush and restart the accumulator
            dataappend(v)
            m, v = 1, 0
    return data
275
276# To represent a big charset, first a bitmap of all characters in the
277# set is constructed. Then, this bitmap is sliced into chunks of 256
278# characters, duplicate chunks are eliminated, and each chunk is
279# given a number. In the compiled expression, the charset is
280# represented by a 16-bit word sequence, consisting of one word for
281# the number of different chunks, a sequence of 256 bytes (128 words)
282# of chunk numbers indexed by their original chunk position, and a
283# sequence of chunks (16 words each).
284
285# Compression is normally good: in a typical charset, large ranges of
286# Unicode will be either completely excluded (e.g. if only cyrillic
287# letters are to be matched), or completely included (e.g. if large
288# subranges of Kanji match). These ranges will be represented by
289# chunks of all one-bits or all zero-bits.
290
# Matching can also be done efficiently: the more significant byte of
292# the Unicode character is an index into the chunk number, and the
293# less significant byte is a bit index in the chunk (just like the
294# CHARSET matching).
295
296# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
297# of the basic multilingual plane; an efficient representation
298# for all of UTF-16 has not yet been developed. This means,
299# in particular, that negated charsets cannot be represented as
300# bigcharsets.
301
def _optimize_unicode(charset, fixup):
    """Compress a BMP-wide character set into a single BIGCHARSET item.

    Builds a 65536-entry bitmap, deduplicates its 256-character
    chunks, and returns [(BIGCHARSET, data)] where data is the block
    count, the 256-entry chunk-index table, and the unique chunk
    bitmaps.  Returns the original charset whenever it cannot be
    compressed (categories, non-BMP characters, or negation on a
    wide build).
    """
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters; XXX now they should work
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            # XXX2: now they should work, but removing this will make the
            # charmap 17 times bigger
            return charset
        for i in range(65536):
            charmap[i] = not charmap[i]
    # deduplicate 256-char chunks; mapping[i] is the block number of
    # the i-th chunk, data concatenates each unique chunk's bitmap
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to a byte array of 256 bytes.
    # BUGFIX: use the unsigned 'B' typecode -- block numbers range
    # 0..255, and the previous signed 'b' raised OverflowError once
    # more than 128 distinct chunks existed (identical bytes for
    # values 0..127, so existing output is unchanged).
    mapping = array.array('B', mapping).tobytes()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    assert len(mapping) * mapping.itemsize == 256
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]
357
358def _simple(av):
359    # check if av is a "simple" operator
360    lo, hi = av[2].getwidth()
361    #if lo == 0 and hi == MAXREPEAT:
362    #    raise error("nothing to repeat")
363    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
364
def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    # code: output list of code words (mutated in place)
    # pattern: an sre_parse pattern object (has getwidth()/.data)
    # flags: SRE_FLAG_* bitmask
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                # prefix_skip tracks how much of the prefix lies
                # outside any group
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # single-item group: look through it for a literal
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # usable only if every alternative starts with
                    # a literal (note the for/else)
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                # same collection logic for a top-level branch
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        # whole pattern is one ungrouped literal run
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)  # hi does not fit in a code word; emit 0 instead
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (failure function over the prefix)
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    # backpatch the info block's total length
    code[skip] = len(code) - skip
470
def isstring(obj):
    """Return True if obj is a text or bytes string."""
    return isinstance(obj, str) or isinstance(obj, bytes)
473
def _code(p, flags):
    """Flatten a parsed pattern into a list of code words.

    p is a parsed pattern object; flags are OR-ed with the pattern's
    own flags.  The result is terminated by a SUCCESS opcode.
    """
    flags |= p.pattern.flags
    code = []
    _compile_info(code, p, flags)   # optional INFO header
    _compile(code, p.data, flags)   # pattern body
    code.append(OPCODES[SUCCESS])
    return code
488
def compile(p, flags=0):
    """Convert a pattern (string or already-parsed) to internal format.

    Parses p when it is a string, flattens it to code words, builds
    the group name/index maps, and hands everything to _sre.compile().
    """
    if isstring(p):
        pattern, p = p, sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction (name -> index and index -> name)
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, number in groupindex.items():
        indexgroup[number] = name

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups - 1,
        groupindex, indexgroup
        )
519