#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3, which has no builtin set type.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
# NOTE: '+' and '-' are members so that exponents such as 1e-5 scan as a
# single token.  As a side effect, GetTokens also scans `1-2` as one CONSTANT.
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index one past the last char of the token in the source
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


def _GetString(source, start, i):
    """Finds the end of a double-quoted string literal.

    Args:
      source: string of C++ source code.
      start: index of the first char of the token (this precedes i when the
        literal carries a prefix such as L or u8).
      i: index of the opening double quote.

    Returns:
      The index one past the closing quote, or 0 when the literal is never
      closed (GetTokens treats an index <= 0 as invalid and stops).
    """
    i = source.find('"', i+1)
    # Stop as soon as find() fails; indexing with -1 would silently wrap
    # around to the end of the source.
    while i > 0 and source[i-1] == '\\':
        # Count the backslashes immediately before the quote.
        backslash_count = 1
        j = i - 2
        while j > start and source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1


def _GetChar(source, start, i):
    """Finds the end of a single-quoted character literal.

    Args:
      source: string of C++ source code.
      start: index of the first char of the token (this precedes i when the
        literal carries a prefix such as L).
      i: index of the opening single quote.

    Returns:
      The index one past the closing quote.  When the literal is never
      closed, start + 1 is returned so only one char is consumed.
    """
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while i > 0 and source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Comments and whitespace are skipped; everything else is yielded as a
    NAME, SYNTAX, CONSTANT, PREPROCESSOR or (inside #if 0 only) UNKNOWN token.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: an unrecognized character was found outside of an
        #if 0 block.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        # One char of lookahead.  Slicing yields '' at the end of the source
        # instead of raising IndexError.
        next_c = source[i+1:i+2]
        if c.isalpha() or c == '_':  # Find a name token.
            token_type = NAME
            # Always consume the first char so the scanner makes progress
            # even for letters outside VALID_IDENTIFIER_CHARS (non-ASCII).
            i += 1
            while i < end and (source[i] in valid_identifier_chars or
                               source[i].isalpha()):
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            prefix = source[start:i]
            quote = source[i:i+1]
            if quote == "'" and prefix in ('u', 'U', 'L'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif quote == '"' and prefix in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and next_c == '/':  # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and next_c == '*':  # Find /* comments. */
            # Search past the opener so that "/*/" is not taken for a
            # complete comment.
            close = source.find('*/', i + 2)
            if close == -1:
                # Unterminated comment: it swallows the rest of the source.
                i = end
            else:
                i = close + 2
            continue
        elif c in ':+-<>&|*=':  # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i:i+1]
            if new_ch == c:
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':  # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i:i+1].isdigit():
                token_type = CONSTANT
                i += 1
                while i < end and source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():  # Find integer.
            token_type = CONSTANT
            # A tuple, not the string 'xX': '' is "in" every string, which
            # would send a trailing 0 down the hex path.
            if c == '0' and next_c in ('x', 'X'):
                # Handle hex digits.
                i += 2
                while i < end and source[i] in hex_digits:
                    i += 1
            else:
                # The first digit is consumed unconditionally to guarantee
                # progress for digits outside the ASCII set.
                i += 1
                while i < end and source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':  # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":  # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':  # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if i < end and source[i] == '"':
                    i = source.find('"', i+1) + 1
                    if i <= 0:
                        # Unterminated quote: the directive runs to the end.
                        i = end
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':  # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyways, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            # Clamp the context window; a negative slice start would wrap
            # around and show the wrong text.
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(0, i-10):i+10]))
            raise RuntimeError('unexpected token')

        # _GetString returns 0 for an unterminated string literal.
        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)