1ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#
2ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Secret Labs' Regular Expression Engine
3ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#
4ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# convert template to internal format
5ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#
6ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#
8ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# See the sre.py file for information on usage and redistribution.
9ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh#
10ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
11ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh"""Internal support module for sre"""
12ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
13ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport _sre, sys
14ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport sre_parse
15ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom sre_constants import *
16ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehfrom _sre import MAXREPEAT
17ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
18ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehassert _sre.MAGIC == MAGIC, "SRE module mismatch"
19ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
20ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehif _sre.CODESIZE == 2:
21ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    MAXCODE = 65535
22ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehelse:
23ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    MAXCODE = 0xFFFFFFFFL
24ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
25ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _identityfunction(x):
26ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return x
27ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
28ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
29ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
30ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_SUCCESS_CODES = set([SUCCESS, FAILURE])
31ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
32ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
33ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _compile(code, pattern, flags):
34ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # internal: compile a (sub)pattern
35ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    emit = code.append
36ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _len = len
37ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    LITERAL_CODES = _LITERAL_CODES
38ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    REPEATING_CODES = _REPEATING_CODES
39ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    SUCCESS_CODES = _SUCCESS_CODES
40ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    ASSERT_CODES = _ASSERT_CODES
41ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for op, av in pattern:
42ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if op in LITERAL_CODES:
43ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_IGNORECASE:
44ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[OP_IGNORE[op]])
45ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(_sre.getlower(av, flags))
46ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
47ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[op])
48ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(av)
49ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is IN:
50ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_IGNORECASE:
51ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[OP_IGNORE[op]])
52ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                def fixup(literal, flags=flags):
53ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    return _sre.getlower(literal, flags)
54ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
55ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[op])
56ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                fixup = _identityfunction
57ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            skip = _len(code); emit(0)
58ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            _compile_charset(av, flags, code, fixup)
59ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            code[skip] = _len(code) - skip
60ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is ANY:
61ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_DOTALL:
62ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[ANY_ALL])
63ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
64ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[ANY])
65ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op in REPEATING_CODES:
66ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_TEMPLATE:
67ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                raise error, "internal: unsupported template operator"
68ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[REPEAT])
69ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                skip = _len(code); emit(0)
70ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(av[0])
71ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(av[1])
72ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                _compile(code, av[2], flags)
73ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[SUCCESS])
74ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[skip] = _len(code) - skip
75ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif _simple(av) and op is not REPEAT:
76ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if op is MAX_REPEAT:
77ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    emit(OPCODES[REPEAT_ONE])
78ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
79ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    emit(OPCODES[MIN_REPEAT_ONE])
80ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                skip = _len(code); emit(0)
81ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(av[0])
82ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(av[1])
83ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                _compile(code, av[2], flags)
84ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[SUCCESS])
85ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[skip] = _len(code) - skip
86ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
87ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[REPEAT])
88ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                skip = _len(code); emit(0)
89ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(av[0])
90ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(av[1])
91ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                _compile(code, av[2], flags)
92ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[skip] = _len(code) - skip
93ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if op is MAX_REPEAT:
94ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    emit(OPCODES[MAX_UNTIL])
95ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
96ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    emit(OPCODES[MIN_UNTIL])
97ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is SUBPATTERN:
98ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if av[0]:
99ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[MARK])
100ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit((av[0]-1)*2)
101ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # _compile_info(code, av[1], flags)
102ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            _compile(code, av[1], flags)
103ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if av[0]:
104ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[MARK])
105ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit((av[0]-1)*2+1)
106ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op in SUCCESS_CODES:
107ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[op])
108ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op in ASSERT_CODES:
109ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[op])
110ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            skip = _len(code); emit(0)
111ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if av[0] >= 0:
112ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(0) # look ahead
113ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
114ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                lo, hi = av[1].getwidth()
115ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if lo != hi:
116ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    raise error, "look-behind requires fixed-width pattern"
117ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(lo) # look behind
118ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            _compile(code, av[1], flags)
119ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[SUCCESS])
120ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            code[skip] = _len(code) - skip
121ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is CALL:
122ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[op])
123ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            skip = _len(code); emit(0)
124ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            _compile(code, av, flags)
125ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[SUCCESS])
126ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            code[skip] = _len(code) - skip
127ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is AT:
128ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[op])
129ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_MULTILINE:
130ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                av = AT_MULTILINE.get(av, av)
131ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_LOCALE:
132ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                av = AT_LOCALE.get(av, av)
133ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif flags & SRE_FLAG_UNICODE:
134ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                av = AT_UNICODE.get(av, av)
135ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(ATCODES[av])
136ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is BRANCH:
137ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[op])
138ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            tail = []
139ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            tailappend = tail.append
140ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for av in av[1]:
141ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                skip = _len(code); emit(0)
142ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # _compile_info(code, av, flags)
143ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                _compile(code, av, flags)
144ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[JUMP])
145ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                tailappend(_len(code)); emit(0)
146ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[skip] = _len(code) - skip
147ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(0) # end of branch
148ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            for tail in tail:
149ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[tail] = _len(code) - tail
150ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is CATEGORY:
151ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[op])
152ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_LOCALE:
153ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                av = CH_LOCALE[av]
154ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif flags & SRE_FLAG_UNICODE:
155ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                av = CH_UNICODE[av]
156ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(CHCODES[av])
157ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is GROUPREF:
158ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_IGNORECASE:
159ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[OP_IGNORE[op]])
160ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
161ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[op])
162ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(av-1)
163ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is GROUPREF_EXISTS:
164ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(OPCODES[op])
165ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(av[0]-1)
166ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            skipyes = _len(code); emit(0)
167ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            _compile(code, av[1], flags)
168ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if av[2]:
169ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(OPCODES[JUMP])
170ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                skipno = _len(code); emit(0)
171ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[skipyes] = _len(code) - skipyes + 1
172ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                _compile(code, av[2], flags)
173ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[skipno] = _len(code) - skipno
174ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
175ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                code[skipyes] = _len(code) - skipyes + 1
176ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
177ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            raise ValueError, ("unsupported operand type", op)
178ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
179ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _compile_charset(charset, flags, code, fixup=None):
180ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # compile charset subprogram
181ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    emit = code.append
182ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if fixup is None:
183ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        fixup = _identityfunction
184ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for op, av in _optimize_charset(charset, fixup):
185ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        emit(OPCODES[op])
186ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if op is NEGATE:
187ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            pass
188ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is LITERAL:
189ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(fixup(av))
190ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is RANGE:
191ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(fixup(av[0]))
192ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            emit(fixup(av[1]))
193ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is CHARSET:
194ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            code.extend(av)
195ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is BIGCHARSET:
196ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            code.extend(av)
197ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif op is CATEGORY:
198ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if flags & SRE_FLAG_LOCALE:
199ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(CHCODES[CH_LOCALE[av]])
200ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif flags & SRE_FLAG_UNICODE:
201ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(CHCODES[CH_UNICODE[av]])
202ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
203ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                emit(CHCODES[av])
204ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        else:
205ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            raise error, "internal: unsupported set operator"
206ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    emit(OPCODES[FAILURE])
207ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
208ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _optimize_charset(charset, fixup):
209ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # internal: optimize character set
210ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    out = []
211ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    outappend = out.append
212ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    charmap = [0]*256
213ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    try:
214ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for op, av in charset:
215ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if op is NEGATE:
216ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                outappend((op, av))
217ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is LITERAL:
218ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                charmap[fixup(av)] = 1
219ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is RANGE:
220ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                for i in range(fixup(av[0]), fixup(av[1])+1):
221ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    charmap[i] = 1
222ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is CATEGORY:
223ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # XXX: could append to charmap tail
224ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                return charset # cannot compress
225ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    except IndexError:
226ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # character set contains unicode characters
227ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return _optimize_unicode(charset, fixup)
228ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # compress character map
229ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    i = p = n = 0
230ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    runs = []
231ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    runsappend = runs.append
232ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for c in charmap:
233ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if c:
234ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if n == 0:
235ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                p = i
236ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            n = n + 1
237ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        elif n:
238ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            runsappend((p, n))
239ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            n = 0
240ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        i = i + 1
241ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if n:
242ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        runsappend((p, n))
243ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if len(runs) <= 2:
244ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # use literal/range
245ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for p, n in runs:
246ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if n == 1:
247ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                outappend((LITERAL, p))
248ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
249ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                outappend((RANGE, (p, p+n-1)))
250ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if len(out) < len(charset):
251ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return out
252ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    else:
253ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # use bitmap
254ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        data = _mk_bitmap(charmap)
255ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        outappend((CHARSET, data))
256ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return out
257ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return charset
258ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
259ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _mk_bitmap(bits):
260ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    data = []
261ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    dataappend = data.append
262ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if _sre.CODESIZE == 2:
263ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        start = (1, 0)
264ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    else:
265ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        start = (1L, 0L)
266ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    m, v = start
267ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for c in bits:
268ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if c:
269ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            v = v + m
270ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        m = m + m
271ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if m > MAXCODE:
272ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            dataappend(v)
273ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            m, v = start
274ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return data
275ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
276ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# To represent a big charset, first a bitmap of all characters in the
277ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# set is constructed. Then, this bitmap is sliced into chunks of 256
278ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# characters, duplicate chunks are eliminated, and each chunk is
279ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# given a number. In the compiled expression, the charset is
280ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# represented by a 16-bit word sequence, consisting of one word for
281ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# the number of different chunks, a sequence of 256 bytes (128 words)
282ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# of chunk numbers indexed by their original chunk position, and a
283ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# sequence of chunks (16 words each).
284ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
285ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Compression is normally good: in a typical charset, large ranges of
286ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Unicode will be either completely excluded (e.g. if only cyrillic
287ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# letters are to be matched), or completely included (e.g. if large
288ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# subranges of Kanji match). These ranges will be represented by
289ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# chunks of all one-bits or all zero-bits.
290ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
291ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# Matching can be also done efficiently: the more significant byte of
292ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# the Unicode character is an index into the chunk number, and the
293ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# less significant byte is a bit index in the chunk (just like the
294ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# CHARSET matching).
295ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
296ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
297ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# of the basic multilingual plane; an efficient representation
298ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# for all of UTF-16 has not yet been developed. This means,
299ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# in particular, that negated charsets cannot be represented as
300ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh# bigcharsets.
301ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
302ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _optimize_unicode(charset, fixup):
303ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    try:
304ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        import array
305ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    except ImportError:
306ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return charset
307ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    charmap = [0]*65536
308ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    negate = 0
309ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    try:
310ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for op, av in charset:
311ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if op is NEGATE:
312ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                negate = 1
313ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is LITERAL:
314ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                charmap[fixup(av)] = 1
315ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is RANGE:
316ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                for i in xrange(fixup(av[0]), fixup(av[1])+1):
317ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    charmap[i] = 1
318ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is CATEGORY:
319ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                # XXX: could expand category
320ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                return charset # cannot compress
321ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    except IndexError:
322ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # non-BMP characters
323ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return charset
324ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if negate:
325ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if sys.maxunicode != 65535:
326ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            # XXX: negation does not work with big charsets
327ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return charset
328ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i in xrange(65536):
329ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            charmap[i] = not charmap[i]
330ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    comps = {}
331ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    mapping = [0]*256
332ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    block = 0
333ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    data = []
334ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for i in xrange(256):
335ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        chunk = tuple(charmap[i*256:(i+1)*256])
336ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        new = comps.setdefault(chunk, block)
337ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        mapping[i] = new
338ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if new == block:
339ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            block = block + 1
340ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            data = data + _mk_bitmap(chunk)
341ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    header = [block]
342ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if _sre.CODESIZE == 2:
343ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        code = 'H'
344ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    else:
345ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        code = 'I'
346ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Convert block indices to byte array of 256 bytes
347ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    mapping = array.array('b', mapping).tostring()
348ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # Convert byte array to word array
349ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    mapping = array.array(code, mapping)
350ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    assert mapping.itemsize == _sre.CODESIZE
351ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    header = header + mapping.tolist()
352ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    data[0:0] = header
353ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return [(BIGCHARSET, data)]
354ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
355ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _simple(av):
356ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # check if av is a "simple" operator
357ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    lo, hi = av[2].getwidth()
358ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if lo == 0 and hi == MAXREPEAT:
359ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        raise error, "nothing to repeat"
360ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
361ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
362ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _compile_info(code, pattern, flags):
363ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # internal: compile an info block.  in the current version,
364ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # this contains min/max pattern width, and an optional literal
365ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # prefix or a character map
366ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    lo, hi = pattern.getwidth()
367ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if lo == 0:
368ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        return # not worth it
369ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # look for a literal prefix
370ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    prefix = []
371ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    prefixappend = prefix.append
372ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    prefix_skip = 0
373ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    charset = [] # not used
374ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    charsetappend = charset.append
375ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if not (flags & SRE_FLAG_IGNORECASE):
376ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # look for literal prefix
377ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for op, av in pattern.data:
378ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if op is LITERAL:
379ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if len(prefix) == prefix_skip:
380ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    prefix_skip = prefix_skip + 1
381ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                prefixappend(av)
382ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is SUBPATTERN and len(av[1]) == 1:
383ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                op, av = av[1][0]
384ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if op is LITERAL:
385ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    prefixappend(av)
386ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
387ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    break
388ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            else:
389ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                break
390ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # if no prefix, look for charset prefix
391ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if not prefix and pattern.data:
392ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            op, av = pattern.data[0]
393ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            if op is SUBPATTERN and av[1]:
394ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                op, av = av[1][0]
395ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                if op is LITERAL:
396ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    charsetappend((op, av))
397ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                elif op is BRANCH:
398ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    c = []
399ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    cappend = c.append
400ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    for p in av[1]:
401ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        if not p:
402ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                            break
403ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        op, av = p[0]
404ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        if op is LITERAL:
405ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                            cappend((op, av))
406ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        else:
407ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                            break
408ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    else:
409ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        charset = c
410ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is BRANCH:
411ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                c = []
412ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                cappend = c.append
413ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                for p in av[1]:
414ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    if not p:
415ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        break
416ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    op, av = p[0]
417ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    if op is LITERAL:
418ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        cappend((op, av))
419ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    else:
420ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                        break
421ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                else:
422ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                    charset = c
423ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            elif op is IN:
424ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                charset = av
425ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh##     if prefix:
426ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh##         print "*** PREFIX", prefix, prefix_skip
427ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh##     if charset:
428ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh##         print "*** CHARSET", charset
429ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # add an info block
430ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    emit = code.append
431ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    emit(OPCODES[INFO])
432ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    skip = len(code); emit(0)
433ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # literal flag
434ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    mask = 0
435ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if prefix:
436ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        mask = SRE_INFO_PREFIX
437ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if len(prefix) == prefix_skip == len(pattern.data):
438ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            mask = mask + SRE_INFO_LITERAL
439ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    elif charset:
440ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        mask = mask + SRE_INFO_CHARSET
441ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    emit(mask)
442ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # pattern length
443ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if lo < MAXCODE:
444ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        emit(lo)
445ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    else:
446ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        emit(MAXCODE)
447ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        prefix = prefix[:MAXCODE]
448ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if hi < MAXCODE:
449ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        emit(hi)
450ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    else:
451ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        emit(0)
452ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # add literal prefix
453ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if prefix:
454ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        emit(len(prefix)) # length
455ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        emit(prefix_skip) # skip
456ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        code.extend(prefix)
457ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        # generate overlap table
458ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        table = [-1] + ([0]*len(prefix))
459ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        for i in xrange(len(prefix)):
460ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            table[i+1] = table[i]+1
461ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
462ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh                table[i+1] = table[table[i+1]-1]+1
463ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        code.extend(table[1:]) # don't store first entry
464ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    elif charset:
465ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        _compile_charset(charset, flags, code)
466ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    code[skip] = len(code) - skip
467ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
468ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehtry:
469ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    unicode
470ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehexcept NameError:
471ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    STRING_TYPES = (type(""),)
472ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehelse:
473ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    STRING_TYPES = (type(""), type(unicode("")))
474ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
475ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef isstring(obj):
476ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for tp in STRING_TYPES:
477ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        if isinstance(obj, tp):
478ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            return 1
479ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return 0
480ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
481ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef _code(p, flags):
482ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
483ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    flags = p.pattern.flags | flags
484ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    code = []
485ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
486ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # compile info block
487ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _compile_info(code, p, flags)
488ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
489ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # compile the pattern
490ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    _compile(code, p.data, flags)
491ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
492ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    code.append(OPCODES[SUCCESS])
493ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
494ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return code
495ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
496ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehdef compile(p, flags=0):
497ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # internal: convert pattern list to internal format
498ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
499ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if isstring(p):
500ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        pattern = p
501ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        p = sre_parse.parse(p, flags)
502ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    else:
503ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        pattern = None
504ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
505ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    code = _code(p, flags)
506ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
507ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # print code
508ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
509ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # XXX: <fl> get rid of this limitation!
510ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    if p.pattern.groups > 100:
511ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        raise AssertionError(
512ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            "sorry, but this version only supports 100 named groups"
513ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh            )
514ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
515ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    # map in either direction
516ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    groupindex = p.pattern.groupdict
517ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    indexgroup = [None] * p.pattern.groups
518ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    for k, i in groupindex.items():
519ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        indexgroup[i] = k
520ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
521ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh    return _sre.compile(
522ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        pattern, flags | p.pattern.flags, code,
523ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        p.pattern.groups-1,
524ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        groupindex, indexgroup
525ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh        )
526