tokenize.py revision 4d8e859e8f0a209a7e999ce9cc0988156c795949
14d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# This module compiles a regular expression that recognizes Python tokens. 24d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# It is designed to match the working of the Python tokenizer exactly. 34d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# It takes care of everything except indentation; 44d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# note that un-escaped newlines are tokens, too. 54d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# tokenprog.regs[3] gives the location of the token without whitespace 64d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# It also defines various subexpressions, but doesn't compile them. 74d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# See the function test() below for an example of how to use. 84d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 94d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossumimport regex 104d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 114d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum# Note: to get a quoted backslash in a regexp, it must be quadrupled. 124d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 134d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumIgnore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?' 144d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 154d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumName = '[a-zA-Z_][a-zA-Z0-9_]*' 164d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 174d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumHexnumber = '0[xX][0-9a-fA-F]*[lL]?' 184d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumOctnumber = '0[0-7]*[lL]?' 194d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumDecnumber = '[1-9][0-9]*[lL]?' 204d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumIntnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber 214d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumExponent = '[eE][-+]?[0-9]+' 224d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumPointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?' 234d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumExpfloat = '[0-9]+' + Exponent 244d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumFloatnumber = Pointfloat + '\|' + Expfloat 254d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumNumber = Intnumber + '\|' + Floatnumber 264d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 274d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumString = '\'\(\\\\.\|[^\\\n\']\)*\'' 284d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 294d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumOperator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>' 304d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumBracket = '[][(){}]' 314d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumSpecial = '[:;.,`\n]' 324d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumFunny = Operator + '\|' + Bracket + '\|' + Special 334d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 344d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumPlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny 354d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 364d8e859e8f0a209a7e999ce9cc0988156c795949Guido van RossumToken = Ignore + '\(' + PlainToken + '\)' 374d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 384d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossumtry: 394d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum save_syntax = regex.set_syntax(0) # Use default syntax 404d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum tokenprog = regex.compile(Token) 414d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossumfinally: 424d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum dummy = regex.set_syntax(save_syntax) # Restore original syntax 434d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 444d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum 454d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossumdef test(file): 464d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum f = open(file, 'r') 474d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum while 1: 484d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum line = f.readline() 494d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum if not line: break 504d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum i, n = 0, len(line) 514d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum while i < n: 524d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum j = tokenprog.match(line, i) 534d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum if j < 0: 544d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum print 'No token at', `line[i:i+20]` + '...' 554d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum i = i+1 564d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum else: 574d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum i = i+j 584d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum a, b = tokenprog.regs[3] 594d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum if a < b: 604d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum print 'Token:', `line[a:b]` 61