1#!/usr/bin/env python
2#
3# Copyright 2007 Neal Norwitz
4# Portions Copyright 2007 Google Inc.
5#
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10#      http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17
18"""Tokenize C++ source code."""
19
20__author__ = 'nnorwitz@google.com (Neal Norwitz)'
21
22
23try:
24    # Python 3.x
25    import builtins
26except ImportError:
27    # Python 2.x
28    import __builtin__ as builtins
29
30
31import sys
32
33from cpp import utils
34
35
36if not hasattr(builtins, 'set'):
37    # Nominal support for Python 2.3.
38    from sets import Set as set
39
40
# Character classes used by the tokenizer.  '$' is accepted inside
# identifiers because so much real-world code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set('_$0123456789' + _letters + _letters.upper())
HEX_DIGITS = set('0123456789' 'abcdef' 'ABCDEF')
# Digits plus the exponent marker and sign characters of a float literal.
INT_OR_FLOAT_DIGITS = set('0123456789' 'eE' '+-')


# C++0x string prefixes.
_STR_PREFIXES = set(['R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'])


# Token types.
UNKNOWN, SYNTAX, CONSTANT, NAME, PREPROCESSOR = (
    'UNKNOWN', 'SYNTAX', 'CONSTANT', 'NAME', 'PREPROCESSOR')

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM = 0
WHENCE_QUEUE = 1
62
63
class Token(object):
    """Plain record describing one C++ token.

    A token is an identifier, one or more syntax characters, a constant,
    or a pre-processor directive.

    Attributes:
      token_type: one of the module's token type constants.
      name: the text of the token.
      start: index of the first char of the token in the source.
      end: end index of the token in the source.  GetTokens passes the
        index just past the last char, so source[start:end] is the text.
      whence: where the token came from; always initialised to
        WHENCE_STREAM here.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type, self.name = token_type, name
        self.start, self.end = start, end
        self.whence = WHENCE_STREAM

    def __str__(self):
        # The source positions are only shown when debugging is enabled.
        if utils.DEBUG:
            return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
        return 'Token(%r)' % self.name

    __repr__ = __str__
87
88
89def _GetString(source, start, i):
90    i = source.find('"', i+1)
91    while source[i-1] == '\\':
92        # Count the trailing backslashes.
93        backslash_count = 1
94        j = i - 2
95        while source[j] == '\\':
96            backslash_count += 1
97            j -= 1
98        # When trailing backslashes are even, they escape each other.
99        if (backslash_count % 2) == 0:
100            break
101        i = source.find('"', i+1)
102    return i + 1
103
104
105def _GetChar(source, start, i):
106    # NOTE(nnorwitz): may not be quite correct, should be good enough.
107    i = source.find("'", i+1)
108    while source[i-1] == '\\':
109        # Need to special case '\\'.
110        if (i - 2) > start and source[i-2] == '\\':
111            break
112        i = source.find("'", i+1)
113    # Try to handle unterminated single quotes (in a #if 0 block).
114    if i < 0:
115        i = start
116    return i + 1
117
118
def GetTokens(source):
    """Returns a sequence of Tokens.

    Comments and whitespace are skipped.  Every other piece of the source
    is yielded as a Token whose name is the matched text and whose
    start/end are the slice bounds of that text in source.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: if an unexpected character is found outside of an
        #if 0 block.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        # One char of lookahead.  Slicing yields '' at the end of the
        # source instead of raising IndexError.
        next_c = source[i+1:i+2]
        if c.isalpha() or c == '_':              # Find a name token.
            token_type = NAME
            # c already qualified as the start of a name; always consume
            # it so that the tokenizer is guaranteed to make progress.
            i += 1
            while i < end and source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            prefix = source[start:i]
            quote = source[i:i+1]
            if quote == "'" and prefix in ('u', 'U', 'L'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif quote == '"' and prefix in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and next_c == '/':         # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and next_c == '*':         # Find /* comments. */
            # Search after the opening "/*" so that "/*/" is not taken
            # for a complete comment.
            close = source.find('*/', i + 2)
            if close == -1:
                # Unterminated comment: it swallows the rest of the source.
                i = end
            else:
                i = close + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i:i+1]
            if new_ch == c:
                # Doubled operator, e.g. ::, ++, <<, &&, ==.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                # Compound operator, e.g. +=, <=, *=.
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i:i+1].isdigit():
                # A float written without a leading digit, e.g. .5f.
                token_type = CONSTANT
                i += 1
                while i < end and source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and next_c in ('x', 'X'):
                # Handle hex digits.
                i += 2
                while i < end and source[i] in hex_digits:
                    i += 1
            else:
                while i < end and source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.  Longer suffixes come
            # first so that e.g. 'ull' is not matched as just 'u'.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            # NOTE(review): only "#if <space>" increments the counter
            # (not #ifdef/#ifndef), while every #endif decrements it, so
            # count_ifs is not a true nesting depth.
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while True:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.  The bounds check
                # covers a directive that runs to the end of the source.
                if i < end and source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                                condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyways, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            # Clamp the context window so that a negative slice start
            # does not wrap around near the beginning of the source.
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[max(0, i-10):i+10]))
            raise RuntimeError('unexpected token')

        # _GetString returns 0 for an unterminated string; stop rather
        # than restart from the beginning of the source.
        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
271
272
if __name__ == '__main__':
    def main(argv):
        """Print the type and text of every token in each named file.

        Args:
          argv: command line arguments; argv[1:] are the file names.
        """
        for path in argv[1:]:
            contents = utils.ReadFile(path)
            # Files for which ReadFile returns None are skipped.
            if contents is not None:
                for tok in GetTokens(contents):
                    print('%-12s: %s' % (tok.token_type, tok.name))
                sys.stdout.write('\n')

    main(sys.argv)
288