#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

# XXX: show string offset and offending character for all errors

import sys

from sre_constants import *

# characters that are not taken as plain literals by the parser
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that introduce a repeat operator
REPEAT_CHARS = "*+?{"

DIGITS = set("0123456789")

OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")

WHITESPACE = set(" \t\n\r\v\f")

# two-character escape token -> (LITERAL, character code)
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# two-character escape token -> position assertion (AT) or category set (IN)
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# flag letter -> SRE_FLAG_* value
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

class Pattern:
    """Master pattern object; keeps track of global attributes.

    Attributes:
        flags:     flag bits, initialised to 0
        open:      list of group ids opened but not yet closed
        groups:    id that the next opened group will receive (starts at 1)
        groupdict: group name -> group id
    """
    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}
    def opengroup(self, name=None):
        """Allocate the next group id, mark it open and return it.

        If name is given it is recorded in groupdict; error is raised
        when that name is already bound to a group.
        """
        gid = self.groups
        self.groups = gid + 1
        if name is not None:
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
                raise error("redefinition of group name %s as group %d; "
                            "was group %d" % (repr(name), gid, ogid))
            self.groupdict[name] = gid
        self.open.append(gid)
        return gid
    def closegroup(self, gid):
        """Mark group gid as no longer open."""
        self.open.remove(gid)
    def checkgroup(self, gid):
        """Return true if gid has been allocated and is not currently open."""
        return gid < self.groups and gid not in self.open

class SubPattern:
    """A subpattern, in intermediate form.

    Wraps a list of (op, argument) pairs and forwards the basic list
    protocol to it.  pattern is the owning state object; width caches
    the result of getwidth().
    """
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None
    def dump(self, level=0):
        """Write an indented, human-readable listing to sys.stdout.

        Output is produced with sys.stdout.write so the method is valid
        on both Python 2 and Python 3; the text written is the same as
        the print statements it replaces (items on one line separated by
        a single space, nested subpatterns on their own lines).
        """
        write = sys.stdout.write
        # NOTE(review): the indent unit is copied verbatim from the
        # visible source, whose whitespace looks collapsed; confirm the
        # intended width against the pristine file.
        unit = " "
        seqtypes = type(()), type([])
        for op, av in self.data:
            write(level*unit + str(op))
            # tracks whether the cursor sits at the beginning of a line
            at_bol = False
            if op == "in":
                # member sublanguage
                write("\n")
                at_bol = True
                for member_op, member_arg in av:
                    write((level+1)*unit + str(member_op) + " " +
                          str(member_arg) + "\n")
            elif op == "branch":
                write("\n")
                at_bol = True
                for position, alternative in enumerate(av[1]):
                    if position > 0:
                        write(level*unit + "or" + "\n")
                    alternative.dump(level+1)
            elif type(av) in seqtypes:
                for a in av:
                    if isinstance(a, SubPattern):
                        if not at_bol:
                            write("\n")
                        a.dump(level+1)
                        at_bol = True
                    else:
                        # a separating space is only needed mid-line
                        if not at_bol:
                            write(" ")
                        write(str(a))
                        at_bol = False
            else:
                write(" " + str(av))
                at_bol = False
            if not at_bol:
                write("\n")
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
        return len(self.data)
    def __delitem__(self, index):
        del self.data[index]
    def __getitem__(self, index):
        # a slice yields a new SubPattern sharing the same state object
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def insert(self, index, code):
        self.data.insert(index, code)
    def append(self, code):
        self.data.append(code)
    def getwidth(self):
        """Return the (min, max) width of this subpattern.

        The result is cached in self.width; both values are clamped to
        the platform integer limit.
        """
        # determine the width (min, max) for this subpattern
        if self.width:
            return self.width
        # sys.maxint only exists on Python 2; use sys.maxsize elsewhere
        limit = getattr(sys, "maxint", None) or sys.maxsize
        # plain ints are enough: they are promoted automatically when the
        # running totals grow past the machine word
        lo = hi = 0
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum and widest maximum over all alternatives
                i = limit
                j = 0
                for alternative in av[1]:
                    l, h = alternative.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in REPEATCODES:
                # av is (min count, max count, repeated subpattern)
                i, j = av[2].getwidth()
                lo = lo + i * av[0]
                hi = hi + j * av[1]
            elif op in UNITCODES:
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, limit)), int(min(hi, limit))
        return self.width

class Tokenizer:
    """Split a pattern string into tokens with one token of lookahead.

    A token is a single character, or a backslash together with the
    character that follows it.  next holds the upcoming token (None at
    the end of the string) and index the offset just past it.
    """
    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()
    def __next(self):
        # advance the lookahead; raises error on a trailing backslash
        if self.index >= len(self.string):
            self.next = None
            return
        char = self.string[self.index]
        if char[0] == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
                raise error("bogus escape (end of line)")
            char = char + c
        self.index = self.index + len(char)
        self.next = char
    def match(self, char, skip=1):
        """Return 1 if the lookahead equals char (consuming it when skip
        is true), otherwise 0."""
        if char == self.next:
            if skip:
                self.__next()
            return 1
        return 0
    def get(self):
        """Return the lookahead token and advance past it."""
        this = self.next
        self.__next()
        return this
    def tell(self):
        """Return an opaque position suitable for seek()."""
        return self.index, self.next
    def seek(self, index):
        """Restore a position previously returned by tell()."""
        self.index, self.next = index

def isident(char):
    """Return true if char is an ASCII letter or an underscore."""
    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"

def isdigit(char):
    """Return true if char is an ASCII decimal digit."""
    return "0" <= char <= "9"

def isname(name):
    """Return True if name is a valid group name.

    A valid name starts with a letter or underscore and continues with
    letters, underscores or digits.  An empty name is not valid (it is
    rejected here instead of failing with IndexError on name[0]).
    """
    # check that group name is a valid string
    if not name or not isident(name[0]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True

def _class_escape(source, escape):
    """Translate an escape token found inside a character class.

    source is the Tokenizer (further digits are read from it) and
    escape the backslash token.  Returns an (op, argument) pair; raises
    error for an escape that cannot be interpreted.
    """
    # handle escape code inside character class
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    # only category sets are meaningful inside a class; position
    # assertions (\A, \B, \Z) fall through and end up as the literal
    # letter instead of leaking an AT code into the set
    if code and code[0] == IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            escape = escape[2:]
            if len(escape) != 2:
                raise error("bogus escape: %s" % repr("\\" + escape))
            return LITERAL, int(escape, 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            escape = escape[1:]
            return LITERAL, int(escape, 8) & 0xff
        elif c in DIGITS:
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))

def _escape(source, escape, state):
    """Translate an escape token found outside a character class.

    source is the Tokenizer, escape the backslash token and state the
    Pattern object used to validate group references.  Returns an
    (op, argument) pair; raises error for an escape that cannot be
    interpreted or that refers to a group which is still open.
    """
    # handle escape code in expression
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c == "0":
            # octal escape
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise error("cannot refer to open group")
                return GROUPREF, group
            raise ValueError
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        # every ValueError above ends in the generic message below
        pass
    raise error("bogus escape: %s" % repr(escape))
3004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3014710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmdef _parse_sub(source, state, nested=1): 3024710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # parse an alternation: a|b|c 3034710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3044710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm items = [] 3054710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm itemsappend = items.append 3064710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm sourcematch = source.match 3074710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 3084710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm itemsappend(_parse(source, state)) 3094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch("|"): 3104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 3114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not nested: 3124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 3134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not source.next or sourcematch(")", 0): 3144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 3154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 3164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "pattern not properly closed" 3174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if len(items) == 1: 3194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return items[0] 3204710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern = SubPattern(state) 3224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend = subpattern.append 3234710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # check if all items share a common prefix 3254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 3264710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm prefix = None 3274710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm for item in items: 3284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not item: 
3294710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 3304710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if prefix is None: 3314710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm prefix = item[0] 3324710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif item[0] != prefix: 3334710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 3344710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 3354710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # all subitems start with a common "prefix". 3364710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # move it out of the branch 3374710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm for item in items: 3384710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm del item[0] 3394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend(prefix) 3404710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue # check next one 3414710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 3424710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3434710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # check if the branch can be replaced by a character set 3444710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm for item in items: 3454710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if len(item) != 1 or item[0][0] != LITERAL: 3464710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 3474710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 3484710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # we can store this as a character set instead of a 3494710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # branch (the compiler may optimize this even more) 3504710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm set = [] 3514710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend = set.append 3524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm for item in items: 3534710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend(item[0]) 3544710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((IN, set)) 3554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return subpattern 3564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 
3574710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern.append((BRANCH, (None, items))) 3584710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return subpattern 3594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3604710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmdef _parse_sub_cond(source, state, condgroup): 3614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm item_yes = _parse(source, state) 3624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if source.match("|"): 3634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm item_no = _parse(source, state) 3644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if source.match("|"): 3654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "conditional backref with more than two branches" 3664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 3674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm item_no = None 3684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if source.next and not source.match(")", 0): 3694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "pattern not properly closed" 3704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern = SubPattern(state) 3714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 3724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return subpattern 3734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm_PATTERNENDERS = set("|)") 3754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm_ASSERTCHARS = set("=!<") 3764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm_LOOKBEHINDASSERTCHARS = set("=!") 3774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) 3784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmdef _parse(source, state): 3804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # parse a simple pattern 3814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern = SubPattern(state) 
3824710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3834710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # precompute constants into local variables 3844710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend = subpattern.append 3854710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm sourceget = source.get 3864710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm sourcematch = source.match 3874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm _len = len 3884710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm PATTERNENDERS = _PATTERNENDERS 3894710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm ASSERTCHARS = _ASSERTCHARS 3904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS 3914710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm REPEATCODES = _REPEATCODES 3924710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3934710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 3944710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 3954710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if source.next in PATTERNENDERS: 3964710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break # end of subpattern 3974710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = sourceget() 3984710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this is None: 3994710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break # end of pattern 4004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 4014710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if state.flags & SRE_FLAG_VERBOSE: 4024710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # skip whitespace and comments 4034710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this in WHITESPACE: 4044710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 4054710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this == "#": 4064710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 4074710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = sourceget() 4084710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this in (None, "\n"): 4094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 
4104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 4114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 4124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this and this[0] not in SPECIAL_CHARS: 4134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((LITERAL, ord(this))) 4144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 4154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == "[": 4164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # character set 4174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm set = [] 4184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend = set.append 4194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm## if sourcematch(":"): 4204710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm## pass # handle character classes 4214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch("^"): 4224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend((NEGATE, None)) 4234710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # check remaining characters 4244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm start = set[:] 4254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 4264710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = sourceget() 4274710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this == "]" and set != start: 4284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 4294710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this and this[0] == "\\": 4304710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm code1 = _class_escape(source, this) 4314710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this: 4324710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm code1 = LITERAL, ord(this) 4334710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 4344710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unexpected end of regular expression" 4354710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch("-"): 4364710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # potential range 4374710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = sourceget() 
4384710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this == "]": 4394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if code1[0] is IN: 4404710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm code1 = code1[1][0] 4414710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend(code1) 4424710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend((LITERAL, ord("-"))) 4434710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 4444710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this: 4454710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this[0] == "\\": 4464710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm code2 = _class_escape(source, this) 4474710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 4484710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm code2 = LITERAL, ord(this) 4494710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if code1[0] != LITERAL or code2[0] != LITERAL: 4504710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad character range" 4514710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm lo = code1[1] 4524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm hi = code2[1] 4534710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if hi < lo: 4544710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad character range" 4554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend((RANGE, (lo, hi))) 4564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 4574710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unexpected end of regular expression" 4584710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 4594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if code1[0] is IN: 4604710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm code1 = code1[1][0] 4614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm setappend(code1) 4624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 4634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # XXX: <fl> should move set optimization to compiler! 
4644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if _len(set)==1 and set[0][0] is LITERAL: 4654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend(set[0]) # optimization 4664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: 4674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((NOT_LITERAL, set[1][1])) # optimization 4684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 4694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # XXX: <fl> should add charmap optimization here 4704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((IN, set)) 4714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 4724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this and this[0] in REPEAT_CHARS: 4734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # repeat previous item 4744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this == "?": 4754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm min, max = 0, 1 4764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == "*": 4774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm min, max = 0, MAXREPEAT 4784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 4794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == "+": 4804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm min, max = 1, MAXREPEAT 4814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == "{": 4824710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if source.next == "}": 4834710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((LITERAL, ord(this))) 4844710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 4854710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm here = source.tell() 4864710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm min, max = 0, MAXREPEAT 4874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm lo = hi = "" 4884710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while source.next in DIGITS: 4894710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm lo = lo + source.get() 
4904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch(","): 4914710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while source.next in DIGITS: 4924710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm hi = hi + sourceget() 4934710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 4944710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm hi = lo 4954710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not sourcematch("}"): 4964710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((LITERAL, ord(this))) 4974710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm source.seek(here) 4984710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 4994710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if lo: 5004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm min = int(lo) 5014710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if hi: 5024710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm max = int(hi) 5034710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if max < min: 5044710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad repeat interval" 5054710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 5064710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "not supported" 5074710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # figure out which item to repeat 5084710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if subpattern: 5094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm item = subpattern[-1:] 5104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 5114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm item = None 5124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not item or (_len(item) == 1 and item[0][0] == AT): 5134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "nothing to repeat" 5144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if item[0][0] in REPEATCODES: 5154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "multiple repeat" 5164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch("?"): 5174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern[-1] = (MIN_REPEAT, (min, 
max, item)) 5184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 5194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern[-1] = (MAX_REPEAT, (min, max, item)) 5204710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 5214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == ".": 5224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((ANY, None)) 5234710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 5244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == "(": 5254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm group = 1 5264710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm name = None 5274710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm condgroup = None 5284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch("?"): 5294710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm group = 0 5304710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # options 5314710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch("P"): 5324710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # python extensions 5334710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if sourcematch("<"): 5344710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # named group: skip forward to end of name 5354710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm name = "" 5364710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 5374710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sourceget() 5384710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char is None: 5394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unterminated name" 5404710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char == ">": 5414710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 5424710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm name = name + char 5434710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm group = 1 5444710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not isname(name): 5454710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad character in group name" 5464710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif sourcematch("="): 
5474710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # named backreference 5484710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm name = "" 5494710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 5504710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sourceget() 5514710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char is None: 5524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unterminated name" 5534710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char == ")": 5544710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 5554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm name = name + char 5564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not isname(name): 5574710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad character in group name" 5584710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm gid = state.groupdict.get(name) 5594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if gid is None: 5604710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unknown group name" 5614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((GROUPREF, gid)) 5624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 5634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 5644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sourceget() 5654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char is None: 5664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unexpected end of pattern" 5674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unknown specifier: ?P%s" % char 5684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif sourcematch(":"): 5694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # non-capturing group 5704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm group = 2 5714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif sourcematch("#"): 5724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # comment 5734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 5744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if source.next is None or 
source.next == ")": 5754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 5764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm sourceget() 5774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not sourcematch(")"): 5784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unbalanced parenthesis" 5794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 5804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif source.next in ASSERTCHARS: 5814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # lookahead assertions 5824710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sourceget() 5834710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm dir = 1 5844710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char == "<": 5854710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if source.next not in LOOKBEHINDASSERTCHARS: 5864710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "syntax error" 5874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm dir = -1 # lookbehind 5884710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sourceget() 5894710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm p = _parse_sub(source, state) 5904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not sourcematch(")"): 5914710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unbalanced parenthesis" 5924710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char == "=": 5934710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((ASSERT, (dir, p))) 5944710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 5954710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((ASSERT_NOT, (dir, p))) 5964710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm continue 5974710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif sourcematch("("): 5984710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # conditional backreference group 5994710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm condname = "" 6004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 6014710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sourceget() 
6024710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char is None: 6034710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unterminated name" 6044710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char == ")": 6054710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 6064710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm condname = condname + char 6074710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm group = 2 6084710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if isname(condname): 6094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm condgroup = state.groupdict.get(condname) 6104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if condgroup is None: 6114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unknown group name" 6124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 6134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm try: 6144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm condgroup = int(condname) 6154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm except ValueError: 6164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad character in group name" 6174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 6184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # flags 6194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not source.next in FLAGS: 6204710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unexpected end of pattern" 6214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while source.next in FLAGS: 6224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm state.flags = state.flags | FLAGS[sourceget()] 6234710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if group: 6244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # parse group contents 6254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if group == 2: 6264710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # anonymous group 6274710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm group = None 6284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 6294710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm group = 
state.opengroup(name) 6304710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if condgroup: 6314710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm p = _parse_sub_cond(source, state, condgroup) 6324710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 6334710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm p = _parse_sub(source, state) 6344710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not sourcematch(")"): 6354710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unbalanced parenthesis" 6364710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if group is not None: 6374710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm state.closegroup(group) 6384710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((SUBPATTERN, (group, p))) 6394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 6404710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 6414710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sourceget() 6424710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char is None: 6434710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unexpected end of pattern" 6444710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char == ")": 6454710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 6464710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unknown extension" 6474710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6484710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == "^": 6494710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend((AT, AT_BEGINNING)) 6504710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6514710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this == "$": 6524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpattern.append((AT, AT_END)) 6534710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6544710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif this and this[0] == "\\": 6554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm code = _escape(source, this, state) 6564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm subpatternappend(code) 
6574710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6584710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 6594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "parser error" 6604710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return subpattern 6624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmdef parse(str, flags=0, pattern=None): 6644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # parse 're' pattern into list of (opcode, argument) tuples 6654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm source = Tokenizer(str) 6674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if pattern is None: 6694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm pattern = Pattern() 6704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm pattern.flags = flags 6714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm pattern.str = str 6724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm p = _parse_sub(source, pattern, 0) 6744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm tail = source.get() 6764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if tail == ")": 6774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unbalanced parenthesis" 6784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif tail: 6794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bogus characters at end of regular expression" 6804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if flags & SRE_FLAG_DEBUG: 6824710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm p.dump() 6834710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6844710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: 6854710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # 
the VERBOSE flag was switched on inside the pattern. to be 6864710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # on the safe side, we'll parse the whole thing again... 6874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return parse(str, p.pattern.flags) 6884710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6894710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return p 6904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 6914710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmdef parse_template(source, pattern): 6924710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # parse 're' replacement string into list of literals and 6934710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # group references 6944710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm s = Tokenizer(source) 6954710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm sget = s.get 6964710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm p = [] 6974710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm a = p.append 6984710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm def literal(literal, p=p, pappend=a): 6994710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if p and p[-1][0] is LITERAL: 7004710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm p[-1] = LITERAL, p[-1][1] + literal 7014710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 7024710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm pappend((LITERAL, literal)) 7034710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm sep = source[:0] 7044710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if type(sep) is type(""): 7054710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm makechar = chr 7064710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 7074710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm makechar = unichr 7084710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 7094710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = sget() 7104710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this is None: 7114710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break # end of replacement string 7124710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if this and this[0] == 
"\\": 7134710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # group 7144710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm c = this[1:2] 7154710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if c == "g": 7164710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm name = "" 7174710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if s.match("<"): 7184710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm while 1: 7194710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm char = sget() 7204710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char is None: 7214710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unterminated group name" 7224710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if char == ">": 7234710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm break 7244710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm name = name + char 7254710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not name: 7264710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad group name" 7274710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm try: 7284710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm index = int(name) 7294710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if index < 0: 7304710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "negative group number" 7314710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm except ValueError: 7324710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not isname(name): 7334710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "bad character in group name" 7344710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm try: 7354710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm index = pattern.groupindex[name] 7364710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm except KeyError: 7374710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise IndexError, "unknown group name" 7384710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm a((MARK, index)) 7394710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif c == "0": 7404710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if s.next in OCTDIGITS: 7414710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = this 
+ sget() 7424710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if s.next in OCTDIGITS: 7434710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = this + sget() 7444710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literal(makechar(int(this[1:], 8) & 0xff)) 7454710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm elif c in DIGITS: 7464710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm isoctal = False 7474710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if s.next in DIGITS: 7484710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = this + sget() 7494710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if (c in OCTDIGITS and this[2] in OCTDIGITS and 7504710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm s.next in OCTDIGITS): 7514710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = this + sget() 7524710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm isoctal = True 7534710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literal(makechar(int(this[1:], 8) & 0xff)) 7544710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if not isoctal: 7554710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm a((MARK, int(this[1:]))) 7564710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 7574710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm try: 7584710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm this = makechar(ESCAPES[this][1]) 7594710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm except KeyError: 7604710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm pass 7614710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literal(this) 7624710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 7634710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literal(this) 7644710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # convert template to groups and literals lists 7654710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm i = 0 7664710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm groups = [] 7674710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm groupsappend = groups.append 7684710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literals = [None] * len(p) 7694710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm for c, s in 
p: 7704710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if c is MARK: 7714710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm groupsappend((i, s)) 7724710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm # literal[i] is already None 7734710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm else: 7744710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literals[i] = s 7754710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm i = i + 1 7764710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return groups, literals 7774710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm 7784710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylmdef expand_template(template, match): 7794710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm g = match.group 7804710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm sep = match.string[:0] 7814710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm groups, literals = template 7824710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literals = literals[:] 7834710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm try: 7844710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm for index, group in groups: 7854710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm literals[index] = s = g(group) 7864710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm if s is None: 7874710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "unmatched group" 7884710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm except IndexError: 7894710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm raise error, "invalid group reference" 7904710c53dcad1ebf3755f3efb9e80ac24bd72a9b2darylm return sep.join(literals) 791