#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
100c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
110c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi"""Internal support module for sre"""
120c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
130c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yiimport _sre, sys
140c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yiimport sre_parse
150c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yifrom sre_constants import *
160c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yifrom _sre import MAXREPEAT
170c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
# Fail at import time if the C extension's MAGIC value differs from the
# MAGIC name in this module's namespace.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Largest value stored in one code word: 16 bits when the extension reports
# a code size of 2, otherwise 32 bits.  No long suffix is needed; integer
# literals and arithmetic promote automatically.
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFF
240c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
250c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _identityfunction(x):
260c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    return x
270c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
# Groups of operators that _compile() tests with a single membership check.
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
320c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def _compile(code, pattern, flags):
    """Append the compiled form of a (sub)pattern to *code*.

    code    -- list of code words; extended in place
    pattern -- iterable of (op, av) pairs
    flags   -- bit mask tested against the SRE_FLAG_* constants

    Variable-length constructs first emit a placeholder word and patch it
    with the distance to the end of the construct once that is known.

    Raises ``error`` for a repeat operator when SRE_FLAG_TEMPLATE is set and
    for a look-behind assertion whose minimum and maximum widths differ
    (``_simple`` may also raise ``error``).  Raises ValueError for an
    operator that none of the branches below handles.  Returns None.
    """
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code)
            emit(0)  # placeholder, patched below
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator")
            if _simple(av) and op is not REPEAT:
                # single-width, non-group body: use the *_ONE opcodes
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is falsy for a subpattern that gets no MARK pair
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code)
            emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code)
            emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            # positions of the JUMP operands that must point past the branch
            jumps = []
            for alternative in av[1]:
                skip = _len(code)
                emit(0)
                _compile(code, alternative, flags)
                emit(OPCODES[JUMP])
                jumps.append(_len(code))
                emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for pos in jumps:
                code[pos] = _len(code) - pos
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code)
            emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code)
                emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError("unsupported operand type", op)
1780c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def _compile_charset(charset, flags, code, fixup=None):
    """Append the compiled form of a character set to *code*.

    charset -- sequence of (op, av) set items
    flags   -- bit mask tested against SRE_FLAG_LOCALE / SRE_FLAG_UNICODE
    code    -- list of code words; extended in place
    fixup   -- callable applied to literal and range values; defaults to
               the identity function

    The set is first passed through _optimize_charset().  The emitted
    sequence always ends with the FAILURE opcode.  Raises ``error`` for a
    set operator that is not handled here.  Returns None.
    """
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET or op is BIGCHARSET:
            # av is already a list of code words
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    emit(OPCODES[FAILURE])
2070c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def _optimize_charset(charset, fixup):
    """Return an equivalent, possibly more compact, list of set items.

    charset -- sequence of (op, av) set items
    fixup   -- callable applied to literal and range values

    Literals and ranges are marked in a 256-entry map.  If the map has at
    most two runs of set entries they are returned as LITERAL/RANGE items
    (only when that is shorter than the input); otherwise the map is
    returned as a single CHARSET bitmap.  NEGATE items are carried over.

    The input is returned unchanged when it contains a CATEGORY item or
    when the rewrite would not be shorter.  If a value does not fit in the
    256-entry map the work is delegated to _optimize_unicode().
    """
    # internal: optimize character set
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map into (start, length) runs of set entries
    runs = []
    start = length = 0
    for index, marked in enumerate(charmap):
        if marked:
            if length == 0:
                start = index
            length = length + 1
        elif length:
            runs.append((start, length))
            length = 0
    if length:
        runs.append((start, length))
    if len(runs) <= 2:
        # use literal/range
        for start, length in runs:
            if length == 1:
                outappend((LITERAL, start))
            else:
                outappend((RANGE, (start, start+length-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset
2580c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def _mk_bitmap(bits):
    """Pack a sequence of truth values into a list of code words.

    The first item of *bits* becomes the least significant bit of the first
    word.  A word is appended each time its bit weight exceeds MAXCODE, so
    trailing items that do not fill a whole word are not emitted.
    """
    data = []
    dataappend = data.append
    # Plain integers suffice: arithmetic promotes automatically when a
    # value outgrows the native int, so no long literals are required.
    m, v = 1, 0
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = 1, 0
    return data
2750c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
3010c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def _optimize_unicode(charset, fixup):
    """Return *charset* as a single BIGCHARSET item when possible.

    charset -- sequence of (op, av) set items
    fixup   -- callable applied to literal and range values

    A 65536-entry map is filled from the literals and ranges, cut into 256
    chunks of 256 entries, and duplicate chunks are stored once.  The result
    data is: the number of distinct chunks, the 256 chunk numbers packed as
    bytes into code words, then the bitmap of each distinct chunk.

    The input is returned unchanged when the ``array`` module is missing,
    when the set contains a CATEGORY item, when a value does not fit in the
    map, or when the set is negated and sys.maxunicode is not 65535.
    """
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: store its bitmap
            block = block + 1
            data.extend(_mk_bitmap(chunk))
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes.  The type code must
    # be unsigned: block numbers go up to 255, and the signed 'b' code
    # raises OverflowError for any value above 127.
    mapping = array.array('B', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]
3540c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def _simple(av):
    """Tell whether the repeat argument *av* has a "simple" body.

    av[2] is the repeated subpattern.  The result is true when its minimum
    and maximum widths are both 1 and its first item is not a SUBPATTERN.
    Raises ``error`` when the body's width range is 0 to MAXREPEAT.
    """
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error("nothing to repeat")
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
3610c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
3620c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yidef _compile_info(code, pattern, flags):
3630c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # internal: compile an info block.  in the current version,
3640c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # this contains min/max pattern width, and an optional literal
3650c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # prefix or a character map
3660c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    lo, hi = pattern.getwidth()
3670c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    if lo == 0:
3680c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        return # not worth it
3690c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # look for a literal prefix
3700c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    prefix = []
3710c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    prefixappend = prefix.append
3720c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    prefix_skip = 0
3730c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    charset = [] # not used
3740c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    charsetappend = charset.append
3750c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    if not (flags & SRE_FLAG_IGNORECASE):
3760c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        # look for literal prefix
3770c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        for op, av in pattern.data:
3780c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            if op is LITERAL:
3790c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                if len(prefix) == prefix_skip:
3800c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    prefix_skip = prefix_skip + 1
3810c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                prefixappend(av)
3820c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            elif op is SUBPATTERN and len(av[1]) == 1:
3830c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                op, av = av[1][0]
3840c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                if op is LITERAL:
3850c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    prefixappend(av)
3860c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                else:
3870c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    break
3880c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            else:
3890c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                break
3900c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        # if no prefix, look for charset prefix
3910c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        if not prefix and pattern.data:
3920c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            op, av = pattern.data[0]
3930c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            if op is SUBPATTERN and av[1]:
3940c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                op, av = av[1][0]
3950c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                if op is LITERAL:
3960c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    charsetappend((op, av))
3970c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                elif op is BRANCH:
3980c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    c = []
3990c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    cappend = c.append
4000c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    for p in av[1]:
4010c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        if not p:
4020c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                            break
4030c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        op, av = p[0]
4040c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        if op is LITERAL:
4050c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                            cappend((op, av))
4060c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        else:
4070c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                            break
4080c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    else:
4090c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        charset = c
4100c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            elif op is BRANCH:
4110c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                c = []
4120c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                cappend = c.append
4130c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                for p in av[1]:
4140c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    if not p:
4150c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        break
4160c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    op, av = p[0]
4170c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    if op is LITERAL:
4180c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        cappend((op, av))
4190c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    else:
4200c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                        break
4210c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                else:
4220c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                    charset = c
4230c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            elif op is IN:
4240c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                charset = av
4250c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi##     if prefix:
4260c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi##         print "*** PREFIX", prefix, prefix_skip
4270c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi##     if charset:
4280c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi##         print "*** CHARSET", charset
4290c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # add an info block
4300c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    emit = code.append
4310c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    emit(OPCODES[INFO])
4320c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    skip = len(code); emit(0)
4330c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # literal flag
4340c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    mask = 0
4350c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    if prefix:
4360c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        mask = SRE_INFO_PREFIX
4370c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        if len(prefix) == prefix_skip == len(pattern.data):
4380c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            mask = mask + SRE_INFO_LITERAL
4390c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    elif charset:
4400c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        mask = mask + SRE_INFO_CHARSET
4410c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    emit(mask)
4420c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # pattern length
4430c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    if lo < MAXCODE:
4440c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        emit(lo)
4450c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    else:
4460c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        emit(MAXCODE)
4470c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        prefix = prefix[:MAXCODE]
4480c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    if hi < MAXCODE:
4490c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        emit(hi)
4500c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    else:
4510c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        emit(0)
4520c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    # add literal prefix
4530c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    if prefix:
4540c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        emit(len(prefix)) # length
4550c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        emit(prefix_skip) # skip
4560c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        code.extend(prefix)
4570c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        # generate overlap table
4580c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        table = [-1] + ([0]*len(prefix))
4590c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        for i in xrange(len(prefix)):
4600c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            table[i+1] = table[i]+1
4610c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
4620c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi                table[i+1] = table[table[i+1]-1]+1
4630c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        code.extend(table[1:]) # don't store first entry
4640c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    elif charset:
4650c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi        _compile_charset(charset, flags, code)
4660c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi    code[skip] = len(code) - skip
4670c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
# STRING_TYPES: tuple of the classes that isstring() accepts.  The native
# string type is always present; the separate ``unicode`` type is added
# only when the running interpreter defines that builtin.
try:
    unicode
except NameError:
    # no ``unicode`` builtin here: the native string type is the only one
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))
4740c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def isstring(obj):
    """Return a true value if obj is an instance of a type in STRING_TYPES.

    The result is a bool; it compares equal to the 1/0 integers this
    helper used to return, so truth tests and equality checks by callers
    are unaffected.
    """
    # isinstance() takes a tuple of classes directly, so there is no need
    # to walk STRING_TYPES by hand.
    return isinstance(obj, STRING_TYPES)
4800c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def _code(p, flags):
    """Build the opcode list for the parsed pattern p.

    flags is OR-ed with p.pattern.flags before being used.  The returned
    list contains whatever _compile_info() and _compile() append to it,
    in that order, followed by the SUCCESS opcode.
    """
    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    # SUCCESS is always the last opcode emitted
    code.append(OPCODES[SUCCESS])

    return code
4950c5958b1636c47ed7c284f859c8e805fd06a0e6Bill Yi
def compile(p, flags=0):
    """Convert a pattern to the internal format (internal use).

    p is either pattern source (anything isstring() accepts), which is
    parsed with sre_parse.parse(), or an already parsed pattern object,
    which is compiled as it is.  flags is OR-ed with the flags carried by
    the parsed pattern.

    Returns the object produced by _sre.compile().  Raises AssertionError
    when p.pattern.groups is greater than 100.
    """
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        # already parsed: there is no source text to pass to _sre.compile()
        pattern = None

    code = _code(p, flags)

    # XXX: <fl> get rid of this limitation!
    # NOTE(review): the check is on the total p.pattern.groups counter, not
    # on named groups only, so the wording of the message looks imprecise.
    # It is kept verbatim in case callers match on the text -- confirm
    # before rewording.
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction: groupindex goes key -> index, indexgroup is
    # the inverse list (None at every index no key refers to)
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
526