1e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka# -*- coding: utf-8 -*-
27627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum#
37627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# Secret Labs' Regular Expression Engine
47627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum#
57627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# convert template to internal format
67627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum#
7770617b23e286f1147f9480b5f625e88e7badd50Fredrik Lundh# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
87627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum#
929c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh# See the sre.py file for information on usage and redistribution.
107627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum#
117627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum
12b8f22749853cf79bfbe3709309e67d1a448f4cabFred Drake"""Internal support module for sre"""
13b8f22749853cf79bfbe3709309e67d1a448f4cabFred Drake
144fb7027ec00daa8c5f891888dedc1b387853c1e0Fredrik Lundhimport _sre, sys
154b798bdf8ab3a4a4b3b11ea60a8f0b1c54e43224Amaury Forgeot d'Arcimport sre_parse
167627c0de6968471996ce05aab200115d56efa1d5Guido van Rossumfrom sre_constants import *
177627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum
18b35ffc0417a0861ccf466c0503c4151725a0267aFredrik Lundhassert _sre.MAGIC == MAGIC, "SRE module mismatch"
19b35ffc0417a0861ccf466c0503c4151725a0267aFredrik Lundh
2078e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwisif _sre.CODESIZE == 2:
2178e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis    MAXCODE = 65535
2278e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwiselse:
2378e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis    MAXCODE = 0xFFFFFFFFL
243562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh
25049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
26049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
27049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_SUCCESS_CODES = set([SUCCESS, FAILURE])
28049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
29049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger
30e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka# Sets of lowercase characters which have the same uppercase.
31e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka_equivalences = (
32e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
33e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x69, 0x131), # iı
34e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
35e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x73, 0x17f), # sſ
36e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # MICRO SIGN, GREEK SMALL LETTER MU
37e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0xb5, 0x3bc), # µμ
38e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
39e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
40e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
41e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3b2, 0x3d0), # βϐ
42e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
43e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3b5, 0x3f5), # εϵ
44e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
45e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3b8, 0x3d1), # θϑ
46e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
47e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3ba, 0x3f0), # κϰ
48e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
49e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3c0, 0x3d6), # πϖ
50e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
51e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3c1, 0x3f1), # ρϱ
52e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
53e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3c2, 0x3c3), # ςσ
54e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
55e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x3c6, 0x3d5), # φϕ
56e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
57e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    (0x1e61, 0x1e9b), # ṡẛ
58e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka)
59e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka
60e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka# Maps the lowercase code to lowercase codes which have the same uppercase.
61e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka_ignorecase_fixes = {i: tuple(j for j in t if i != j)
62e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                     for t in _equivalences for i in t}
63e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka
64436c3d58a2570f3b599e59b4071f944f774ec441Fredrik Lundhdef _compile(code, pattern, flags):
6529c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # internal: compile a (sub)pattern
66be2211e9401a0be96915c473ef99041beb5a4992Fredrik Lundh    emit = code.append
6701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger    _len = len
68049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger    LITERAL_CODES = _LITERAL_CODES
69049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger    REPEATING_CODES = _REPEATING_CODES
70049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger    SUCCESS_CODES = _SUCCESS_CODES
71049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger    ASSERT_CODES = _ASSERT_CODES
72e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    if (flags & SRE_FLAG_IGNORECASE and
73e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka            not (flags & SRE_FLAG_LOCALE) and
74e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka            flags & SRE_FLAG_UNICODE):
75e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka        fixes = _ignorecase_fixes
76e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    else:
77e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka        fixes = None
787627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum    for op, av in pattern:
7901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger        if op in LITERAL_CODES:
8090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            if flags & SRE_FLAG_IGNORECASE:
81e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                lo = _sre.getlower(av, flags)
82e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                if fixes and lo in fixes:
83e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    emit(OPCODES[IN_IGNORE])
84e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    skip = _len(code); emit(0)
85e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    if op is NOT_LITERAL:
86e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        emit(OPCODES[NEGATE])
87e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    for k in (lo,) + fixes[lo]:
88e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        emit(OPCODES[LITERAL])
89e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        emit(k)
90e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    emit(OPCODES[FAILURE])
91e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    code[skip] = _len(code) - skip
92e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                else:
93e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    emit(OPCODES[OP_IGNORE[op]])
94e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    emit(lo)
9590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            else:
9690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(OPCODES[op])
972e24044f9db23c3d2195a129f53f2deb73a4e4afFredrik Lundh                emit(av)
9890a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        elif op is IN:
9990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            if flags & SRE_FLAG_IGNORECASE:
10090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(OPCODES[OP_IGNORE[op]])
10190a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                def fixup(literal, flags=flags):
1020640e1161f37fd3415e9efdbde1e293efb98978cFredrik Lundh                    return _sre.getlower(literal, flags)
10390a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            else:
10490a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(OPCODES[op])
105e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                fixup = None
10601c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            skip = _len(code); emit(0)
107e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka            _compile_charset(av, flags, code, fixup, fixes)
10801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            code[skip] = _len(code) - skip
10943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh        elif op is ANY:
11043b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            if flags & SRE_FLAG_DOTALL:
111e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(OPCODES[ANY_ALL])
11243b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            else:
113e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(OPCODES[ANY])
11401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger        elif op in REPEATING_CODES:
11590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            if flags & SRE_FLAG_TEMPLATE:
11629c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh                raise error, "internal: unsupported template operator"
11790a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(OPCODES[REPEAT])
11801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                skip = _len(code); emit(0)
11990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(av[0])
12090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(av[1])
12190a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                _compile(code, av[2], flags)
12290a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(OPCODES[SUCCESS])
12301c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[skip] = _len(code) - skip
12401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            elif _simple(av) and op is not REPEAT:
12501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                if op is MAX_REPEAT:
12641c99e7f96f7a0f192839801c568d8a80dcc7091Guido van Rossum                    emit(OPCODES[REPEAT_ONE])
12741c99e7f96f7a0f192839801c568d8a80dcc7091Guido van Rossum                else:
12841c99e7f96f7a0f192839801c568d8a80dcc7091Guido van Rossum                    emit(OPCODES[MIN_REPEAT_ONE])
12901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                skip = _len(code); emit(0)
130e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(av[0])
131e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(av[1])
132e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                _compile(code, av[2], flags)
133e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(OPCODES[SUCCESS])
13401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[skip] = _len(code) - skip
13590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            else:
136e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(OPCODES[REPEAT])
13701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                skip = _len(code); emit(0)
138e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(av[0])
139e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                emit(av[1])
140e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                _compile(code, av[2], flags)
14101c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[skip] = _len(code) - skip
14201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                if op is MAX_REPEAT:
143e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                    emit(OPCODES[MAX_UNTIL])
14490a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                else:
145e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh                    emit(OPCODES[MIN_UNTIL])
14690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        elif op is SUBPATTERN:
14729c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh            if av[0]:
14890a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(OPCODES[MARK])
14929c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh                emit((av[0]-1)*2)
1507898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            # _compile_info(code, av[1], flags)
15190a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            _compile(code, av[1], flags)
15229c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh            if av[0]:
15390a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                emit(OPCODES[MARK])
15429c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh                emit((av[0]-1)*2+1)
15501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger        elif op in SUCCESS_CODES:
15643b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            emit(OPCODES[op])
15701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger        elif op in ASSERT_CODES:
1586f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh            emit(OPCODES[op])
15901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            skip = _len(code); emit(0)
1606f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh            if av[0] >= 0:
1616f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh                emit(0) # look ahead
1626f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh            else:
1636f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh                lo, hi = av[1].getwidth()
1646f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh                if lo != hi:
1656f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh                    raise error, "look-behind requires fixed-width pattern"
1666f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh                emit(lo) # look behind
1676f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh            _compile(code, av[1], flags)
1686f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh            emit(OPCODES[SUCCESS])
16901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            code[skip] = _len(code) - skip
1706f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh        elif op is CALL:
17143b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            emit(OPCODES[op])
17201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            skip = _len(code); emit(0)
17343b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            _compile(code, av, flags)
17443b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            emit(OPCODES[SUCCESS])
17501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            code[skip] = _len(code) - skip
17643b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh        elif op is AT:
17743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            emit(OPCODES[op])
17843b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            if flags & SRE_FLAG_MULTILINE:
179b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh                av = AT_MULTILINE.get(av, av)
180b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh            if flags & SRE_FLAG_LOCALE:
181b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh                av = AT_LOCALE.get(av, av)
182b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh            elif flags & SRE_FLAG_UNICODE:
183b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh                av = AT_UNICODE.get(av, av)
184b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh            emit(ATCODES[av])
18543b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh        elif op is BRANCH:
18629c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh            emit(OPCODES[op])
18743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            tail = []
18801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            tailappend = tail.append
18943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            for av in av[1]:
19001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                skip = _len(code); emit(0)
1917898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                # _compile_info(code, av, flags)
19243b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh                _compile(code, av, flags)
19343b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh                emit(OPCODES[JUMP])
19401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                tailappend(_len(code)); emit(0)
19501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[skip] = _len(code) - skip
19643b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            emit(0) # end of branch
19743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            for tail in tail:
19801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[tail] = _len(code) - tail
19943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh        elif op is CATEGORY:
20043b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            emit(OPCODES[op])
20143b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            if flags & SRE_FLAG_LOCALE:
202b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh                av = CH_LOCALE[av]
20343b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            elif flags & SRE_FLAG_UNICODE:
204b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh                av = CH_UNICODE[av]
205b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh            emit(CHCODES[av])
20672b82ba16dea929b3fa9db5208b2353e8449c2d5Fredrik Lundh        elif op is GROUPREF:
20743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            if flags & SRE_FLAG_IGNORECASE:
20843b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh                emit(OPCODES[OP_IGNORE[op]])
20943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            else:
21043b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh                emit(OPCODES[op])
21143b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh            emit(av-1)
212ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer        elif op is GROUPREF_EXISTS:
213ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer            emit(OPCODES[op])
214c30faa812c507c94d744419bce7c497f1a283d95Andrew M. Kuchling            emit(av[0]-1)
21501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger            skipyes = _len(code); emit(0)
216ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer            _compile(code, av[1], flags)
217ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer            if av[2]:
218ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer                emit(OPCODES[JUMP])
21901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                skipno = _len(code); emit(0)
22001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[skipyes] = _len(code) - skipyes + 1
221ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer                _compile(code, av[2], flags)
22201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[skipno] = _len(code) - skipno
223ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer            else:
22401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                code[skipyes] = _len(code) - skipyes + 1
22590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        else:
22690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            raise ValueError, ("unsupported operand type", op)
2277627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum
228e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchakadef _compile_charset(charset, flags, code, fixup=None, fixes=None):
2297898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    # compile charset subprogram
2307898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    emit = code.append
231e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka    for op, av in _optimize_charset(charset, fixup, fixes,
232e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                                    flags & SRE_FLAG_UNICODE):
2337898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        emit(OPCODES[op])
2347898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        if op is NEGATE:
2357898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            pass
2367898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        elif op is LITERAL:
237e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka            emit(av)
2387898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        elif op is RANGE:
239e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka            emit(av[0])
240e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka            emit(av[1])
2417898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        elif op is CHARSET:
2427898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            code.extend(av)
24319af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh        elif op is BIGCHARSET:
24419af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh            code.extend(av)
2457898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        elif op is CATEGORY:
2467898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            if flags & SRE_FLAG_LOCALE:
2477898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                emit(CHCODES[CH_LOCALE[av]])
2487898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            elif flags & SRE_FLAG_UNICODE:
2497898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                emit(CHCODES[CH_UNICODE[av]])
2507898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            else:
2517898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                emit(CHCODES[av])
2527898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        else:
2537898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            raise error, "internal: unsupported set operator"
2547898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    emit(OPCODES[FAILURE])
2557898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh
256e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchakadef _optimize_charset(charset, fixup, fixes, isunicode):
2577898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    # internal: optimize character set
2587898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    out = []
259c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    tail = []
260c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    charmap = bytearray(256)
261c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    for op, av in charset:
262c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        while True:
263c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            try:
264c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                if op is LITERAL:
265e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    if fixup:
266e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        i = fixup(av)
267e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        charmap[i] = 1
268e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        if fixes and i in fixes:
269e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                            for k in fixes[i]:
270e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                                charmap[k] = 1
271e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    else:
272e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        charmap[av] = 1
273c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                elif op is RANGE:
274e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    r = range(av[0], av[1]+1)
275e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    if fixup:
276e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                        r = map(fixup, r)
277e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    if fixup and fixes:
278e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        for i in r:
279e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                            charmap[i] = 1
280e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                            if i in fixes:
281e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                                for k in fixes[i]:
282e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                                    charmap[k] = 1
283e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                    else:
284e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                        for i in r:
285e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka                            charmap[i] = 1
286c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                elif op is NEGATE:
287c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                    out.append((op, av))
288c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                else:
289c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                    tail.append((op, av))
290c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            except IndexError:
291c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                if len(charmap) == 256:
292c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                    # character set contains non-UCS1 character codes
293c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                    charmap += b'\0' * 0xff00
294c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                    continue
295c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                # character set contains non-BMP character codes
296e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                if fixup and isunicode and op is RANGE:
297e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    lo, hi = av
298e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    ranges = [av]
299e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    # There are only two ranges of cased astral characters:
300e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
301e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    _fixup_range(max(0x10000, lo), min(0x11fff, hi),
302e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                                 ranges, fixup)
303e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    for lo, hi in ranges:
304e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                        if lo == hi:
305e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                            tail.append((LITERAL, hi))
306e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                        else:
307e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                            tail.append((RANGE, (lo, hi)))
308e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                else:
309e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    tail.append((op, av))
310c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            break
311c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka
3127898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    # compress character map
3137898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    runs = []
314c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    q = 0
315c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    while True:
316c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        p = charmap.find(b'\1', q)
317c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        if p < 0:
318c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            break
319c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        if len(runs) >= 2:
320c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            runs = None
321c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            break
322c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        q = charmap.find(b'\0', p)
323c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        if q < 0:
324c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            runs.append((p, len(charmap)))
325c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            break
326c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        runs.append((p, q))
327c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    if runs is not None:
3287898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        # use literal/range
329c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        for p, q in runs:
330c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            if q - p == 1:
331c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                out.append((LITERAL, p))
3327898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            else:
333c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka                out.append((RANGE, (p, q - 1)))
334c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        out += tail
335e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka        # if the case was changed or new representation is more compact
336e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka        if fixup or len(out) < len(charset):
3377898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            return out
338e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka        # else original character set is good enough
339c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        return charset
340c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka
341c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # use bitmap
342c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    if len(charmap) == 256:
34319af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh        data = _mk_bitmap(charmap)
344c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        out.append((CHARSET, data))
345c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        out += tail
3467898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        return out
3477898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh
348c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # To represent a big charset, first a bitmap of all characters in the
349c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # set is constructed. Then, this bitmap is sliced into chunks of 256
350c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # characters, duplicate chunks are eliminated, and each chunk is
351c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # given a number. In the compiled expression, the charset is
352c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # represented by a 32-bit word sequence, consisting of one word for
353c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # the number of different chunks, a sequence of 256 bytes (64 words)
354c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # of chunk numbers indexed by their original chunk position, and a
355c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # sequence of 256-bit chunks (8 words each).
35619af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh
357c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # Compression is normally good: in a typical charset, large ranges of
358c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # Unicode will be either completely excluded (e.g. if only cyrillic
359c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # letters are to be matched), or completely included (e.g. if large
360c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # subranges of Kanji match). These ranges will be represented by
361c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # chunks of all one-bits or all zero-bits.
36219af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh
363c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # Matching can be also done efficiently: the more significant byte of
364c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # the Unicode character is an index into the chunk number, and the
365c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # less significant byte is a bit index in the chunk (just like the
366c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # CHARSET matching).
36719af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh
368c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
369c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # of the basic multilingual plane; an efficient representation
370c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # for all of Unicode has not yet been developed.
37119af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh
372c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    charmap = bytes(charmap) # should be hashable
37319af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh    comps = {}
374c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    mapping = bytearray(256)
37519af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh    block = 0
376c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    data = bytearray()
377c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    for i in range(0, 65536, 256):
378c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        chunk = charmap[i: i + 256]
379c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        if chunk in comps:
380c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            mapping[i // 256] = comps[chunk]
381c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka        else:
382c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            mapping[i // 256] = comps[chunk] = block
383c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            block += 1
384c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            data += chunk
385c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    data = _mk_bitmap(data)
386c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    data[0:0] = [block] + _bytes_to_codes(mapping)
387c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    out.append((BIGCHARSET, data))
388c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    out += tail
389c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    return out
390c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka
391e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchakadef _fixup_range(lo, hi, ranges, fixup):
392e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka    for i in map(fixup, range(lo, hi+1)):
393e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka        for k, (lo, hi) in enumerate(ranges):
394e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka            if i < lo:
395e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                if l == lo - 1:
396e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    ranges[k] = (i, hi)
397e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                else:
398e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    ranges.insert(k, (i, i))
399e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                break
400e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka            elif i > hi:
401e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                if i == hi + 1:
402e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    ranges[k] = (lo, i)
403e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                    break
404e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka            else:
405e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka                break
406e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka        else:
407e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka            ranges.append((i, i))
408e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka
409c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka_CODEBITS = _sre.CODESIZE * 8
410c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka_BITS_TRANS = b'0' + b'1' * 255
411c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchakadef _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
412c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    s = bytes(bits).translate(_BITS_TRANS)[::-1]
413c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    return [_int(s[i - _CODEBITS: i], 2)
414c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka            for i in range(len(s), 0, -_CODEBITS)]
415c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka
416c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchakadef _bytes_to_codes(b):
417c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    # Convert block indices to word array
418c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    import array
4197d9c6c7e8c1e381de7e96989c1332cf98d766f3aMartin v. Löwis    if _sre.CODESIZE == 2:
42078e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis        code = 'H'
42178e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis    else:
4227d9c6c7e8c1e381de7e96989c1332cf98d766f3aMartin v. Löwis        code = 'I'
423c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    a = array.array(code, bytes(b))
424c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    assert a.itemsize == _sre.CODESIZE
425c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    assert len(a) * a.itemsize == len(b)
426c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka    return a.tolist()
42719af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh
4287898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundhdef _simple(av):
4297898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    # check if av is a "simple" operator
4307898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    lo, hi = av[2].getwidth()
4317898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
4327898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh
43329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundhdef _compile_info(code, pattern, flags):
43429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # internal: compile an info block.  in the current version,
4353562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    # this contains min/max pattern width, and an optional literal
4363562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    # prefix or a character map
43729c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    lo, hi = pattern.getwidth()
43829c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    if lo == 0:
43990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        return # not worth it
44029c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # look for a literal prefix
44129c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    prefix = []
44201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger    prefixappend = prefix.append
4437898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh    prefix_skip = 0
4443562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    charset = [] # not used
44501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger    charsetappend = charset.append
44629c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    if not (flags & SRE_FLAG_IGNORECASE):
4477898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        # look for literal prefix
44890a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        for op, av in pattern.data:
44990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            if op is LITERAL:
4507898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                if len(prefix) == prefix_skip:
4517898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    prefix_skip = prefix_skip + 1
45201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                prefixappend(av)
4537898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            elif op is SUBPATTERN and len(av[1]) == 1:
4547898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                op, av = av[1][0]
4557898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                if op is LITERAL:
45601c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                    prefixappend(av)
4577898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                else:
4587898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    break
45990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh            else:
46090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh                break
4617898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        # if no prefix, look for charset prefix
4627898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        if not prefix and pattern.data:
4637898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            op, av = pattern.data[0]
4647898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            if op is SUBPATTERN and av[1]:
4657898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                op, av = av[1][0]
4667898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                if op is LITERAL:
46701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                    charsetappend((op, av))
4687898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                elif op is BRANCH:
4697898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    c = []
47001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                    cappend = c.append
4717898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    for p in av[1]:
4727898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                        if not p:
4737898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                            break
4747898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                        op, av = p[0]
4757898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                        if op is LITERAL:
47601c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                            cappend((op, av))
4777898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                        else:
4787898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                            break
4797898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    else:
4807898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                        charset = c
4817898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            elif op is BRANCH:
4827898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                c = []
48301c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                cappend = c.append
4847898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                for p in av[1]:
4857898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    if not p:
4867898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                        break
4877898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    op, av = p[0]
4887898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    if op is LITERAL:
48901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger                        cappend((op, av))
4907898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    else:
4917898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                        break
4927898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                else:
4937898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                    charset = c
4947898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            elif op is IN:
4957898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                charset = av
4967898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh##     if prefix:
4977898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh##         print "*** PREFIX", prefix, prefix_skip
4987898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh##     if charset:
4997898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh##         print "*** CHARSET", charset
50029c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # add an info block
50129c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    emit = code.append
50229c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    emit(OPCODES[INFO])
50329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    skip = len(code); emit(0)
50429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # literal flag
50529c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    mask = 0
5063562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    if prefix:
5073562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh        mask = SRE_INFO_PREFIX
5087898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        if len(prefix) == prefix_skip == len(pattern.data):
5093562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh            mask = mask + SRE_INFO_LITERAL
5103562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    elif charset:
5113562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh        mask = mask + SRE_INFO_CHARSET
51229c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    emit(mask)
51329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # pattern length
5143562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    if lo < MAXCODE:
5153562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh        emit(lo)
5163562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    else:
5173562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh        emit(MAXCODE)
5183562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh        prefix = prefix[:MAXCODE]
5193562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    if hi < MAXCODE:
52090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        emit(hi)
52129c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    else:
52290a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        emit(0)
52329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # add literal prefix
52429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    if prefix:
5257898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        emit(len(prefix)) # length
5267898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        emit(prefix_skip) # skip
5277898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        code.extend(prefix)
5287898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        # generate overlap table
5297898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        table = [-1] + ([0]*len(prefix))
53001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger        for i in xrange(len(prefix)):
5317898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            table[i+1] = table[i]+1
5327898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
5337898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh                table[i+1] = table[table[i+1]-1]+1
5347898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh        code.extend(table[1:]) # don't store first entry
5353562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh    elif charset:
536577fb5a1db508444eed7bd78fbf76c1f64ed643bGuido van Rossum        _compile_charset(charset, flags, code)
53729c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    code[skip] = len(code) - skip
53829c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh
5398a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundhtry:
54012723baceab61f8812d68575c962696cc4e77fa1Just van Rossum    unicode
5418a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundhexcept NameError:
54274902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum    STRING_TYPES = (type(""),)
54312723baceab61f8812d68575c962696cc4e77fa1Just van Rossumelse:
54412723baceab61f8812d68575c962696cc4e77fa1Just van Rossum    STRING_TYPES = (type(""), type(unicode("")))
5458a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundh
54674902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossumdef isstring(obj):
54774902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum    for tp in STRING_TYPES:
54874902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum        if isinstance(obj, tp):
54974902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum            return 1
55074902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum    return 0
55174902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum
5522f2c67d7e5934bdf96835f3c4774388b3e654314Fredrik Lundhdef _code(p, flags):
55329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh
554b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton    flags = p.pattern.flags | flags
555be2211e9401a0be96915c473ef99041beb5a4992Fredrik Lundh    code = []
55629c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh
55729c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # compile info block
55829c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    _compile_info(code, p, flags)
55929c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh
56029c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh    # compile the pattern
561b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton    _compile(code, p.data, flags)
56229c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh
563b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton    code.append(OPCODES[SUCCESS])
56429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh
56529c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh    return code
56629c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh
56729c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundhdef compile(p, flags=0):
56829c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh    # internal: convert pattern list to internal format
56929c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh
57074902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum    if isstring(p):
57129c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh        pattern = p
57229c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh        p = sre_parse.parse(p, flags)
57329c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh    else:
57429c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh        pattern = None
57529c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh
5762f2c67d7e5934bdf96835f3c4774388b3e654314Fredrik Lundh    code = _code(p, flags)
57729c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh
5788a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundh    # print code
5798a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundh
580770617b23e286f1147f9480b5f625e88e7badd50Fredrik Lundh    # XXX: <fl> get rid of this limitation!
5815e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh    if p.pattern.groups > 100:
5825e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh        raise AssertionError(
5835e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh            "sorry, but this version only supports 100 named groups"
5845e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh            )
58529c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh
586c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh    # map in either direction
587c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh    groupindex = p.pattern.groupdict
588c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh    indexgroup = [None] * p.pattern.groups
589c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh    for k, i in groupindex.items():
590c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh        indexgroup[i] = k
591c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh
592b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton    return _sre.compile(
593ae04c3356ed2aec0e9e2c39096a3ccd05722575aGuido van Rossum        pattern, flags | p.pattern.flags, code,
5946f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh        p.pattern.groups-1,
5956f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh        groupindex, indexgroup
59690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh        )
597