# tokenize.py, revision 4d8e859e8f0a209a7e999ce9cc0988156c795949
# (author per revision annotation: Guido van Rossum)
#
# This module compiles a regular expression that recognizes Python tokens.
# It is designed to match the working of the Python tokenizer exactly.
# It takes care of everything except indentation;
# note that un-escaped newlines are tokens, too.
# tokenprog.regs[3] gives the location of the token without whitespace.
# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.

import regex

# Note: to get a quoted backslash in a regexp, it must be quadrupled.
#
# The patterns below use the syntax in which \( \) group and \| separates
# alternatives, while bare ( ) and | are ordinary characters.

# Text skipped before a token: blanks and tabs, any number of
# backslash-newline continuations, and an optional comment.
# This contributes groups 1 and 2 of the final Token expression.
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

# Identifiers (and keywords, which are not distinguished here).
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Integer literals, each with an optional long suffix.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber

# Floating point literals: digits with a decimal point and an optional
# exponent, or digits followed by a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\|' + Expfloat

# Floatnumber is listed before Intnumber: with a matcher that accepts the
# first alternative that succeeds, '1.5' would otherwise be split into the
# integer '1' followed by the float '.5'.
Number = Floatnumber + '\|' + Intnumber

# Single-quoted string: any run of backslash-escaped characters or of
# characters other than backslash, newline and quote.
String = '\'\(\\\\.\|[^\\\n\']\)*\''

# Operators; two-character operators come before their one-character
# prefixes (e.g. '==' before '=', '<<' before '<').
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
Bracket = '[][(){}]'
# Remaining one-character tokens, including the un-escaped newline.
Special = '[:;.,`\n]'
Funny = Operator + '\|' + Bracket + '\|' + Special

# Number precedes Funny so that a leading '.' followed by digits is read
# as a float rather than as the '.' token.
PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny

# The complete expression.  Ignore supplies groups 1 and 2, so the group
# opened here is number 3: the token itself, without leading whitespace.
Token = Ignore + '\(' + PlainToken + '\)'

# Compile Token into tokenprog under syntax 0 (the default syntax), then
# put back whatever syntax was in effect before.
# set_syntax() is called outside the try block: if it fails there is
# nothing to restore, and the finally clause must not refer to a
# save_syntax that was never assigned.
save_syntax = regex.set_syntax(0) # Use default syntax
try:
	tokenprog = regex.compile(Token)
finally:
	dummy = regex.set_syntax(save_syntax) # Restore original syntax


# Example use of tokenprog: print every token found in the file named
# by the argument, one line of input at a time.
#
# At each position, tokenprog.match() is tried.  A negative result means
# no token starts there; the offending text is reported and scanning
# resumes one character further on.  Otherwise the result is added to the
# position to step past the match, and tokenprog.regs[3] gives the
# (start, end) offsets of the token itself within the line.
def test(file):
	f = open(file, 'r')
	try:
		while 1:
			line = f.readline()
			if not line: break
			i, n = 0, len(line)
			while i < n:
				j = tokenprog.match(line, i)
				if j < 0:
					# Single-argument print(...) gives the same
					# output as the print statement did.
					print('No token at ' + repr(line[i:i+20]) + '...')
					i = i+1
				else:
					i = i+j
					a, b = tokenprog.regs[3]
					if a < b:
						print('Token: ' + repr(line[a:b]))
	finally:
		# Release the file even if reading or matching fails.
		f.close()