idl_lexer.py revision a1401311d1ab56c4ed0a474bd38c108f75cb0cd9
#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and the WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/
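#
# For example (an illustrative fragment, not taken from the spec), the input:
#
#   interface Foo { };
#
# lexes to an INTERFACE token ('interface'), a SYMBOL token ('Foo'), and the
# literal tokens '{', '}' and ';'.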

import os.path
import re
import sys

#
# Try to load the ply module; if it is not found, assume it lives in the
# third_party directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
      'COMMENT',
      'DESCRIBE',
      'ENUM',
      'LABEL',
      'SYMBOL',
      'INLINE',
      'INTERFACE',
      'READONLY',
      'STRUCT',
      'TYPEDEF',
      'OR',

    # Extra WebIDL keywords
      'CALLBACK',
      'DICTIONARY',
      'OPTIONAL',
      'STATIC',

    # Invented for apps use
      'NAMESPACE',

    # Data types
      'FLOAT',
      'OCT',
      'INT',
      'HEX',
      'STRING',

    # Operators
      'LSHIFT',
      'RSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',

    'or' : 'OR',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'
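
  # Note: PLY tries string rules in order of decreasing regular expression
  # length, so, for example, '0x1F' is matched by t_HEX before t_INT can
  # claim the leading '0'.  By the same rule, an octal literal such as '017'
  # is matched by the longer t_INT expression rather than by t_OCT.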

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in IDL strings.  Strings are used exclusively
  # for attributes, not as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t
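
  # For example, the (illustrative) input "abc" yields a STRING token whose
  # value is abc; any backslashes between the quotes are kept verbatim.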

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t
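
  # Note that a run of '//' comments on consecutive lines is folded into a
  # single COMMENT token, e.g. the (illustrative) lines:
  #
  #   // first
  #   // second
  #
  # produce one COMMENT whose value spans both lines.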

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip the leading underscore so that a symbol can have the same
    # value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
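
  # For example, the (illustrative) input '_interface' lexes as a SYMBOL with
  # value 'interface', while a plain 'interface' lexes as the keyword
  # INTERFACE.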

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for the previous lines are unused.  We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file:
      return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))
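
  # Together these helpers produce a message of the form (shown with a
  # hypothetical file 'foo.idl'):
  #
  #   foo.idl(2) : Unrecognized input
  #   interface Foo { $ };
  #                   ^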

  def SetData(self, filename, data):
    # Start with line 1, not zero
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)

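#
# A minimal usage sketch of IDLLexer (the file name 'example.idl' is
# hypothetical):
#
#   lexer = IDLLexer()
#   lexer.SetData('example.idl', 'interface Foo { };')
#   tok = lexer.lexobj.token()   # -> INTERFACE token with value 'interface'
#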

#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while True:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


#
# TokensFromText
#
# From a block of text, generate a list of token values.  This mirrors
# TextToTokens below, but tags the data with the file name 'unknown'.
#
def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while True:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist

#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while True:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist
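
#
# For example (illustrative), TextToTokens('interface Foo') returns the
# values ['interface', 'Foo'].
#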


#
# TestSame
#
# From a set of token values, generate a new source text by joining with a
# single newline.  The new source is then tokenized and compared against the
# old set.
#
def TestSame(values1):
  # Recreate the source from the tokens.  We use newlines instead of spaces
  # since the '//' and #inline regular expressions are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1


#
# TestExpect
#
# From a list of tokens in (expected type, token) pairs, verify that the type
# field of the second item matches the value of the first, so that:
#   INT 123 FLOAT 1.1
# generates a passing test, where the first token is the SYMBOL 'INT', the
# second token is the INT 123, the third token is the SYMBOL 'FLOAT' and the
# fourth is the FLOAT 1.1, etc...
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch:  Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
  return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
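#
# Example invocation (assuming idl_option exposes the 'verbose', 'test' and
# 'output' options as command-line flags of the same names; the file name
# 'some_file.idl' is hypothetical):
#
#   python idl_lexer.py --test some_file.idl
#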