14710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm#
24710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm# Secret Labs' Regular Expression Engine
34710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm#
44710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm# convert re-style regular expression to sre pattern
54710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm#
64710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
74710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm#
84710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm# See the sre.py file for information on usage and redistribution.
94710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm#
104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm"""Internal support module for sre"""
124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm# XXX: show string offset and offending character for all errors
144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmimport sys
164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmfrom sre_constants import *
184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
# characters that are not plain literals when they start a token
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that introduce a repeat operator
REPEAT_CHARS = "*+?{"

# digit classes; the octal and hexadecimal sets are derived from DIGITS
DIGITS = set("0123456789")
OCTDIGITS = DIGITS - set("89")
HEXDIGITS = DIGITS | set("abcdefABCDEF")

# space, tab, newline, carriage return, vertical tab, form feed
WHITESPACE = set([" ", "\t", "\n", "\r", "\v", "\f"])
284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
# Maps each simple two-character escape to a LITERAL code carrying the
# ordinal of the character it denotes.
ESCAPES = dict(
    (escape, (LITERAL, ord(char)))
    for escape, char in [
        (r"\a", "\a"),
        (r"\b", "\b"),
        (r"\f", "\f"),
        (r"\n", "\n"),
        (r"\r", "\r"),
        (r"\t", "\t"),
        (r"\v", "\v"),
        (r"\\", "\\"),
    ]
)
394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
# Maps each category / position escape to its (opcode, argument) pair.
CATEGORIES = dict([
    # position assertions
    (r"\A", (AT, AT_BEGINNING_STRING)),  # start of string
    (r"\Z", (AT, AT_END_STRING)),  # end of string
    (r"\b", (AT, AT_BOUNDARY)),
    (r"\B", (AT, AT_NON_BOUNDARY)),
    # character categories, each wrapped in a one-member set
    (r"\d", (IN, [(CATEGORY, CATEGORY_DIGIT)])),
    (r"\D", (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)])),
    (r"\s", (IN, [(CATEGORY, CATEGORY_SPACE)])),
    (r"\S", (IN, [(CATEGORY, CATEGORY_NOT_SPACE)])),
    (r"\w", (IN, [(CATEGORY, CATEGORY_WORD)])),
    (r"\W", (IN, [(CATEGORY, CATEGORY_NOT_WORD)])),
])
524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
# Maps each single-letter flag character to its SRE flag bit.
FLAGS = dict(
    # standard flags
    i=SRE_FLAG_IGNORECASE,
    L=SRE_FLAG_LOCALE,
    m=SRE_FLAG_MULTILINE,
    s=SRE_FLAG_DOTALL,
    x=SRE_FLAG_VERBOSE,
    # extensions
    t=SRE_FLAG_TEMPLATE,
    u=SRE_FLAG_UNICODE,
)
644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class Pattern:
    """Master pattern object; keeps track of attributes global to a parse.

    Attributes:
        flags: integer flag bits, initially 0.
        open: ids of groups that have been opened and not yet closed.
        groups: id that the next opened group will receive (starts at 1).
        groupdict: maps group name to group id.
    """

    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}

    def opengroup(self, name=None):
        """Allocate the next group id, mark it open, and return it.

        When name is given it is recorded in groupdict.  Raises error if
        the name is already bound to a group; the group counter has
        already been advanced at that point.
        """
        gid = self.groups
        self.groups = gid + 1
        if name is not None:
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
                raise error("redefinition of group name %s as group %d; "
                            "was group %d" % (repr(name), gid,  ogid))
            self.groupdict[name] = gid
        self.open.append(gid)
        return gid

    def closegroup(self, gid):
        """Mark group gid as closed (ValueError if it is not open)."""
        self.open.remove(gid)

    def checkgroup(self, gid):
        """Return True if gid has been allocated and is no longer open."""
        return gid < self.groups and gid not in self.open
874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class SubPattern:
    """A subpattern in intermediate form.

    Attributes:
        pattern: the object passed in at construction (stored as given).
        data: list of (opcode, argument) pairs.
        width: cached result of getwidth(), or None if not yet computed.
    """

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None

    def dump(self, level=0):
        """Write an indented, human-readable listing to sys.stdout.

        Each entry starts a line with its opcode, indented two spaces per
        level; nested SubPattern arguments are dumped one level deeper.
        """
        write = sys.stdout.write
        indent = level * "  "
        for op, av in self.data:
            # words of the output line that has not been terminated yet
            pending = [indent + str(op)]
            if op == IN:
                # member sublanguage: one "opcode argument" line per member
                write(pending.pop() + "\n")
                for member_op, member_av in av:
                    write("%s%s %s\n" % (indent + "  ", member_op, member_av))
            elif op == BRANCH:
                write(pending.pop() + "\n")
                for position, alternative in enumerate(av[1]):
                    if position > 0:
                        write(indent + "or\n")
                    alternative.dump(level + 1)
            elif type(av) in (tuple, list):
                for a in av:
                    if isinstance(a, SubPattern):
                        # finish the current line before the nested listing
                        if pending:
                            write(" ".join(pending) + "\n")
                            pending = []
                        a.dump(level + 1)
                    else:
                        pending.append(str(a))
            else:
                pending.append(str(av))
            if pending:
                write(" ".join(pending) + "\n")

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # a slice yields a new SubPattern sharing the same pattern object
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return the (min, max) width of this subpattern.

        The result is cached in self.width; both bounds are clamped to
        the platform's largest plain integer.
        """
        if self.width is not None:
            return self.width
        # sys.maxint is the Python 2 name; use sys.maxsize where it is gone
        limit = getattr(sys, "maxint", None)
        if limit is None:
            limit = sys.maxsize
        lo = hi = 0
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op == BRANCH:
                # narrowest minimum and widest maximum over all alternatives
                shortest = limit
                longest = 0
                for alternative in av[1]:
                    l, h = alternative.getwidth()
                    shortest = min(shortest, l)
                    longest = max(longest, h)
                lo = lo + shortest
                hi = hi + longest
            elif op == CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op == SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in REPEATCODES:
                # av is (min count, max count, repeated subpattern)
                i, j = av[2].getwidth()
                lo = lo + i * av[0]
                hi = hi + j * av[1]
            elif op in UNITCODES:
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, limit)), int(min(hi, limit))
        return self.width
1764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
class Tokenizer:
    """Steps through a pattern string one token at a time.

    A token is a single character, or a backslash together with the
    character that follows it.  self.next always holds the upcoming
    token, or None once the string is exhausted.
    """

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()

    def __next(self):
        # load the token at self.index into self.next and advance past it
        if self.index >= len(self.string):
            self.next = None
            return
        char = self.string[self.index]
        if char[0] == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
                raise error("bogus escape (end of line)")
            char = char + c
        self.index = self.index + len(char)
        self.next = char

    def match(self, char, skip=1):
        """Return 1 if the upcoming token equals char, else 0.

        On a match the token is consumed unless skip is false.
        """
        if char == self.next:
            if skip:
                self.__next()
            return 1
        return 0

    def get(self):
        """Consume and return the upcoming token (None at end of string)."""
        this = self.next
        self.__next()
        return this

    def tell(self):
        """Return the current position as an (index, next token) pair."""
        return self.index, self.next

    def seek(self, index):
        """Restore a position previously returned by tell()."""
        self.index, self.next = index
2094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def isident(char):
    """Return True if char is "_" or sorts within "a".."z" or "A".."Z"."""
    if char == "_":
        return True
    if "a" <= char <= "z":
        return True
    return "A" <= char <= "Z"
2124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def isdigit(char):
    """Return True if char sorts between "0" and "9" inclusive."""
    if char < "0":
        return False
    return char <= "9"
2154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def isname(name):
    """Return True if name is a valid group name.

    A valid name is non-empty, starts with an identifier character
    (see isident) and continues with identifier characters or digits.
    """
    # an empty name has no first character to inspect; reject it here
    # rather than failing with IndexError on name[0]
    if not name:
        return False
    if not isident(name[0]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True
2244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def _class_escape(source, escape):
    """Translate an escape found inside a character class.

    source is the tokenizer, used to read any further digits that belong
    to the escape; escape is the backslash token already read.  Returns
    an (opcode, argument) pair.  Raises error ("bogus escape: ...") when
    the escape cannot be interpreted.
    """
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    # only the category escapes (IN codes) are set members; position
    # assertions in CATEGORIES (AT codes) are not returned from here and
    # fall through to the generic handling below
    if code and code[0] == IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            # check the length before slicing so that the error message
            # below reports the escape as written, "x" included
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # 8 or 9: neither octal nor anything else meaningful here
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
2564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def _escape(source, escape, state):
    """Translate an escape found in an expression (outside a class).

    source is the tokenizer, used to read any further digits that belong
    to the escape; escape is the backslash token already read; state
    supplies the group count and open-group check for back-references.
    Returns an (opcode, argument) pair.  Raises error for a reference to
    a group that is still open, or ("bogus escape: ...") when the escape
    cannot be interpreted.
    """
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c == "0":
            # octal escape
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise error("cannot refer to open group")
                return GROUPREF, group
            # group number not allocated yet: reported as a bogus escape
            raise ValueError
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
3004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def _parse_sub(source, state, nested=1):
    """Parse an alternation (a|b|c) and return it as a SubPattern.

    Alternatives are read with _parse until no "|" follows.  When nested
    is true, the alternation must then be followed by ")" or the end of
    the pattern, otherwise error is raised.  A single alternative is
    returned as is; several are combined after hoisting any common
    leading items, as a character set when every alternative is one
    literal and as a BRANCH otherwise.
    """
    items = []
    itemsappend = items.append
    sourcematch = source.match
    while 1:
        itemsappend(_parse(source, state))
        if sourcematch("|"):
            continue
        if not nested:
            break
        if not source.next or sourcematch(")", 0):
            break
        else:
            raise error("pattern not properly closed")

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while 1:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpatternappend(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        charset = [item[0] for item in items]
        subpatternappend((IN, charset))
        return subpattern

    subpatternappend((BRANCH, (None, items)))
    return subpattern
3594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def _parse_sub_cond(source, state, condgroup):
    """Parse the body of a conditional back-reference.

    Reads the "yes" pattern and, if a "|" follows, the "no" pattern.
    Returns a SubPattern holding a single GROUPREF_EXISTS entry whose
    argument is (condgroup, yes pattern, no pattern or None).  Raises
    error if there are more than two branches or if the body is followed
    by anything other than ")" or the end of the pattern.
    """
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    else:
        item_no = None
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern
3734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
# lookup sets copied into locals by _parse
_PATTERNENDERS = set(["|", ")"])  # tokens that end a simple pattern
_ASSERTCHARS = set(["=", "!", "<"])
_LOOKBEHINDASSERTCHARS = set(["=", "!"])
_REPEATCODES = set((MIN_REPEAT, MAX_REPEAT))
3784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def _parse_group_name(source, terminator):
    # collect tokens from source up to the terminator token, which is
    # consumed but not included; raises error if the input ends first
    sourceget = source.get
    chars = []
    while 1:
        char = sourceget()
        if char is None:
            raise error("unterminated name")
        if char == terminator:
            break
        chars.append(char)
    return "".join(chars)

def _parse(source, state):
    # parse a simple pattern
    #
    # source is the tokenizer to read from (uses .next, .get(), .match(),
    # .tell() and .seek()); state carries .flags and .groupdict and hands
    # out group ids through .opengroup()/.closegroup().
    #
    # tokens are consumed until the input is exhausted or the next token
    # is one of _PATTERNENDERS (that token is left for the caller).  the
    # parsed items are returned as a SubPattern of (opcode, argument)
    # tuples.  raises error for malformed patterns.
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                # drop everything up to and including the end of line
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            items = []
            itemsappend = items.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                itemsappend((NEGATE, None))
            # check remaining characters
            # (a "]" only closes the set once something has been added
            # after the optional "^", so a leading "]" is a literal)
            start = items[:]
            while 1:
                this = sourceget()
                if this == "]" and items != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # trailing "-" right before "]" is a literal dash
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        itemsappend(code1)
                        itemsappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("bad character range")
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error("bad character range")
                        itemsappend((RANGE, (lo, hi)))
                    else:
                        raise error("unexpected end of regular expression")
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    itemsappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(items) == 1 and items[0][0] is LITERAL:
                subpatternappend(items[0]) # optimization
            elif (_len(items) == 2 and items[0][0] is NEGATE
                  and items[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, items[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, items))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                minrep, maxrep = 0, 1
            elif this == "*":
                minrep, maxrep = 0, MAXREPEAT

            elif this == "+":
                minrep, maxrep = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is not a repeat; treat the brace as a literal
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                minrep, maxrep = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed {m,n}: emit a literal "{" and
                    # rewind so the following text is parsed normally
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    minrep = int(lo)
                if hi:
                    maxrep = int(hi)
                if maxrep < minrep:
                    raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in REPEATCODES:
                raise error("multiple repeat")
            # a "?" directly after the repeat operator selects MIN_REPEAT
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (minrep, maxrep, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (minrep, maxrep, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = anonymous, 0 = no group contents
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = _parse_group_name(source, ">")
                        group = 1
                        if not isname(name):
                            raise error("bad character in group name")
                    elif sourcematch("="):
                        # named backreference
                        name = _parse_group_name(source, ")")
                        if not isname(name):
                            raise error("bad character in group name")
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while source.next is not None and source.next != ")":
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = _parse_group_name(source, ")")
                    group = 2
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error("unknown group name")
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error("bad character in group name")
                        # 0 (or a negative number) cannot name a group;
                        # reject it instead of silently parsing the
                        # contents as an unconditional group
                        if condgroup < 1:
                            raise error("bad group number")
                else:
                    # flags
                    if source.next is None:
                        raise error("unexpected end of pattern")
                    if source.next not in FLAGS:
                        raise error("unknown extension")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup is not None:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # flags only: the group has to be closed right away
                char = sourceget()
                if char is None:
                    raise error("unexpected end of pattern")
                if char != ")":
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error("parser error")

    return subpattern
6624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def parse(str, flags=0, pattern=None):
    # parse 're' pattern into list of (opcode, argument) tuples
    #
    # str is the pattern text and flags the initial SRE_FLAG_* bits.  an
    # existing Pattern state object may be supplied; otherwise a fresh
    # one is created.  raises error if anything is left over once the
    # top-level parse has finished.

    tokens = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    result = _parse_sub(tokens, pattern, 0)

    # the top-level parse must have consumed the entire input
    leftover = tokens.get()
    if leftover == ")":
        raise error("unbalanced parenthesis")
    if leftover:
        raise error("bogus characters at end of regular expression")

    if flags & SRE_FLAG_DEBUG:
        result.dump()

    if flags & SRE_FLAG_VERBOSE:
        return result

    if result.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, result.pattern.flags)

    return result
6904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references
    #
    # returns (groups, literals): literals has one slot per template
    # part, holding the literal text or None where a group goes, and
    # groups lists (slot, group index) pairs for those None slots.
    # group names are looked up in pattern.groupindex.
    tokens = Tokenizer(source)
    next_token = tokens.get
    parts = []
    add_part = parts.append

    def add_literal(text):
        # adjacent literal text is merged into a single part
        if parts and parts[-1][0] is LITERAL:
            parts[-1] = LITERAL, parts[-1][1] + text
        else:
            add_part((LITERAL, text))

    # produce characters of the same string type as the template
    if type(source[:0]) is type(""):
        makechar = chr
    else:
        makechar = unichr

    while 1:
        this = next_token()
        if this is None:
            break # end of replacement string
        if not (this and this[0] == "\\"):
            add_literal(this)
            continue

        # escape sequence: dispatch on the character after the backslash
        c = this[1:2]
        if c == "g":
            # \g<name> or \g<number>
            name = ""
            if tokens.match("<"):
                while 1:
                    char = next_token()
                    if char is None:
                        raise error("unterminated group name")
                    if char == ">":
                        break
                    name = name + char
            if not name:
                raise error("bad group name")
            try:
                index = int(name)
                if index < 0:
                    raise error("negative group number")
            except ValueError:
                # not a number, so it has to be a known group name
                if not isname(name):
                    raise error("bad character in group name")
                try:
                    index = pattern.groupindex[name]
                except KeyError:
                    raise IndexError("unknown group name")
            add_part((MARK, index))
        elif c == "0":
            # octal escape: \0 followed by at most two more octal digits
            for _ in (0, 1):
                if tokens.next not in OCTDIGITS:
                    break
                this += next_token()
            add_literal(makechar(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            # three octal digits form a character code; anything shorter
            # (or containing 8/9) is a one or two digit group reference
            octal = False
            if tokens.next in DIGITS:
                this += next_token()
                octal = (c in OCTDIGITS and this[2] in OCTDIGITS and
                         tokens.next in OCTDIGITS)
                if octal:
                    this += next_token()
            if octal:
                add_literal(makechar(int(this[1:], 8) & 0xff))
            else:
                add_part((MARK, int(this[1:])))
        else:
            # known escapes are translated; unknown ones are kept verbatim
            try:
                this = makechar(ESCAPES[this][1])
            except KeyError:
                pass
            add_literal(this)

    # convert template to groups and literals lists
    groups = []
    literals = [None] * len(parts)
    for slot, (code, value) in enumerate(parts):
        if code is MARK:
            groups.append((slot, value))
            # literals[slot] is already None
        else:
            literals[slot] = value
    return groups, literals
7774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm
def expand_template(template, match):
    # fill a (groups, literals) template with the groups of a match
    #
    # each (slot, group) pair in groups puts match.group(group) into
    # that slot of a copy of literals; the slots are then joined using
    # an empty string of the same type as match.string.  raises error
    # if a referenced group did not take part in the match or does not
    # exist.
    fetch = match.group
    empty = match.string[:0]
    slots, fixed = template
    # work on a copy so the template itself is left untouched
    pieces = fixed[:]
    try:
        for slot, group in slots:
            text = fetch(group)
            pieces[slot] = text
            if text is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return empty.join(pieces)
791