1dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter#!/usr/bin/env python
2dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter#
3dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# Copyright 2007 Neal Norwitz
4dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# Portions Copyright 2007 Google Inc.
5dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter#
6dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# Licensed under the Apache License, Version 2.0 (the "License");
7dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# you may not use this file except in compliance with the License.
8dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# You may obtain a copy of the License at
9dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter#
10dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter#      http://www.apache.org/licenses/LICENSE-2.0
11dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter#
12dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# Unless required by applicable law or agreed to in writing, software
13dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# distributed under the License is distributed on an "AS IS" BASIS,
14dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# See the License for the specific language governing permissions and
16dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter# limitations under the License.
17dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
18dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter"""Tokenize C++ source code."""
19dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
20dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter__author__ = 'nnorwitz@google.com (Neal Norwitz)'
21dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
22dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
23dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixtertry:
24dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter    # Python 3.x
25dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter    import builtins
26dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixterexcept ImportError:
27dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter    # Python 2.x
28dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter    import __builtin__ as builtins
29dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
30dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
31dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixterimport sys
32dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
33dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixterfrom cpp import utils
34dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
35dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
36dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixterif not hasattr(builtins, 'set'):
37dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter    # Nominal support for Python 2.3.
38dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter    from sets import Set as set
39dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
40dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
# '$' is accepted as an identifier character because so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(
    _letters.lower() + _letters.upper() + '0123456789' + '_$')
HEX_DIGITS = set('0123456789' 'abcdef' 'ABCDEF')
# Characters that may appear in the body of an integer or float literal
# (digits, exponent marker, exponent sign).
INT_OR_FLOAT_DIGITS = set('0123456789' 'eE' '+-')


# C++0x string prefixes.
_STR_PREFIXES = set(['R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'])


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# Tokens created in this module are always tagged WHENCE_STREAM.
WHENCE_STREAM = 0
WHENCE_QUEUE = 1
62dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
63dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
class Token(object):
    """Plain record describing one lexical token of C++ source.

    A token is an identifier, one or more syntax characters, a constant,
    or a pre-processor directive.

    Attributes:
      token_type: the kind of token, as passed by the creator.
      name: the text of the token.
      start: index of the first char of the token in the source.
      end: index associated with the end of the token in the source.  The
        tokenizer in this module passes the index just past the last char,
        so that source[start:end] is the token text.
      whence: origin marker; initialised to WHENCE_STREAM.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type, self.name = token_type, name
        self.start, self.end = start, end
        self.whence = WHENCE_STREAM

    def __str__(self):
        # Positions are only included when debugging is switched on.
        if utils.DEBUG:
            return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
        return 'Token(%r)' % self.name

    __repr__ = __str__
87dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
88dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
def _GetString(source, start, i):
    """Finds the end of a double-quoted string literal.

    Args:
      source: string of C++ source code.
      start: index where the enclosing token starts (unused here; kept so
        the signature parallels _GetChar).
      i: index of the opening double quote.

    Returns:
      The index just past the closing, unescaped double quote, or 0 when
      the string is never terminated.
    """
    i = source.find('"', i+1)
    # Stop as soon as find() fails: indexing source with -1 would silently
    # inspect the *end* of the source and could restart the search at 0.
    while i >= 0 and source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while j >= 0 and source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    # i is -1 for an unterminated string, which yields 0.
    return i + 1
103dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
104dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
def _GetChar(source, start, i):
    """Finds the end of a single-quoted character literal.

    Args:
      source: string of C++ source code.
      start: index where the enclosing token starts.
      i: index of the opening single quote.

    Returns:
      The index just past the closing single quote, or start + 1 when the
      quote is never terminated.
    """
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    # Stop as soon as find() fails: indexing source with -1 would silently
    # inspect the *end* of the source and could restart the search at 0.
    while i >= 0 and source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
117dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
118dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
def GetTokens(source):
    """Returns a sequence of Tokens.

    Whitespace, // comments, /* */ comments and stray backslashes are
    skipped.  A pre-processor directive (including backslash continuation
    lines) is returned as a single PREPROCESSOR token.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: if an unrecognised character is found while errors are
        not being ignored (errors are ignored after an '#if 0' directive).
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    # NOTE(review): count_ifs is only incremented for '#if ' while '#endif'
    # decrements it for every conditional (#ifdef/#ifndef too), so the
    # counter is approximate.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        # One character of lookahead.  Slicing yields '' at the end of the
        # input instead of raising IndexError.
        next_c = source[i+1:i+2]
        if c.isalpha() or c == '_':              # Find a name token.
            token_type = NAME
            # Always consume the first char so progress is guaranteed even
            # for alphabetic chars that are not in valid_identifier_chars.
            i += 1
            while i < end and source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            prefix = source[start:i]
            quote = source[i:i+1]
            if quote == "'" and prefix in ('u', 'U', 'L'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif quote == '"' and prefix in _STR_PREFIXES:
                # A string prefix is followed by a double quote.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and next_c == '/':         # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and next_c == '*':         # Find /* comments. */
            # Search after the opening '/*' so that '/*/' is not mistaken
            # for a complete comment.
            i = source.find('*/', i + 2)
            if i == -1:
                # Unterminated comment: it swallows the rest of the input.
                i = end
            else:
                i += 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i:i+1]
            if new_ch == c:
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and next_c.isdigit():
                # A float written without a leading digit, e.g. .5f
                token_type = CONSTANT
                i += 1
                while i < end and source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and next_c in ('x', 'X'):
                # Handle hex digits.
                i += 2
                while i < end and source[i] in hex_digits:
                    i += 1
            else:
                # Always consume the first digit so progress is guaranteed
                # even for digit chars outside int_or_float_digits2.
                i += 1
                while i < end and source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.  The bounds check
                # covers a directive that ends the input without a newline.
                if i < end and source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyways, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            # Clamp the context window so that a negative start index does
            # not wrap around near the beginning of the source.
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(0, i-10):i+10]))
            raise RuntimeError('unexpected token')

        # The string helper returns 0 when no closing quote exists.
        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
271dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
272dd1c93d5709e32713961cfd95fe30489a4ad2d26Ken Mixter
if __name__ == '__main__':
    def main(argv):
        """Prints the tokens of each file named in argv[1:] (testing aid)."""
        for path in argv[1:]:
            text = utils.ReadFile(path)
            if text is not None:
                # One line per token: the type padded to 12 columns, then
                # the token text.
                for tok in GetTokens(text):
                    print('%-12s: %s' % (tok.token_type, tok.name))
                sys.stdout.write('\n')


    main(sys.argv)
288