idl_lexer.py revision 3240926e260ce088908e02ac07a6cf7b0c0cbf44
1#!/usr/bin/env python
2# Copyright (c) 2013 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6""" Lexer for PPAPI IDL
7
8The lexer uses the PLY library to build a tokenizer which understands both
9WebIDL and Pepper tokens.
10
11WebIDL, and WebIDL regular expressions can be found at:
12   http://www.w3.org/TR/2012/CR-WebIDL-20120419/
13PLY can be found at:
14   http://www.dabeaz.com/ply/
15"""
16
17import optparse
18import os.path
19import sys
20
21#
22# Try to load the ply module, if not, then assume it is in the third_party
23# directory.
24#
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  # Only a failed import should trigger the fallback; a bare 'except' would
  # also hide KeyboardInterrupt, SystemExit and unrelated errors.
  # Fall back to '<dir of this file>/../../third_party' on the module path.
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex
35
36#
37# IDL Lexer
38#
class IDLLexer(object):
  """Tokenizer for WebIDL / Pepper IDL text, built on PLY's lex module.

  Usage: construct an instance, load text with Tokenize(), then pull tokens
  with token() or GetTokens().  The PLY lexer object is created lazily by
  Lexer().

  PLY finds rules by reflection: every attribute named 't_<TYPE>' is a rule,
  and for a rule method the docstring *is* the regular expression.  Do not
  add prose docstrings to the 't_' methods, and keep them in their current
  order (PLY tries function rules in definition order, see the PLY docs).
  """

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
      'float',
      'integer',
      'string',

    # Symbol and keywords types
      'COMMENT',
      'identifier',

    # MultiChar operators
      'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type.  All tokens matching
  # KEYWORD_OR_SYMBOL are matched against keywords dictionary, to determine
  # if the token is actually a keyword.  Lookups are case sensitive, so the
  # keys must be spelled exactly as in the WebIDL grammar.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum'  : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    # WebIDL spells this terminal 'NaN' (it was previously listed as 'Nan',
    # which never matched spec-conforming input).
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial'  : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # Defined before t_integer so that '1.5' is tried as a float first.
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t


  # A line ending '\n', we use this to increment the line number.  Nothing is
  # returned, so no token is emitted for line endings.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    # Drop the surrounding quotes; the pattern allows embedded newlines, so
    # they have to be counted here to keep the line number in sync.
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  # Consecutive '//' lines are merged into a single COMMENT token.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.  The lookup uses the raw
    # text, so an underscore-prefixed name never matches a keyword.
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the same
    # value as a keywords (E.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  # Error rule for every lexer state: writes a diagnostic to stderr and bumps
  # the error count.  It does not skip any input itself.
  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    lexer = self.Lexer()
    line = lexer.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    # NOTE(review): Tokenize() starts lineno at 1 while 'index' starts with a
    # single entry and both grow together in AddLines(), so this condition
    # looks like it always holds -- confirm whether that is intended.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      # split() returns nothing when the remaining input is only whitespace
      # (e.g. a stray '\r'); fall back to the offending character instead of
      # raising IndexError.
      words = t.value.split()
      word = words[0] if words else t.value[:1]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(lexer.lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = lexer.lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1


  def AddLines(self, count):
    """Advance the line counter by |count| and record line start offsets."""
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded value for previous lines are unused.  We still
    # fill the array however, to make sure the line count is correct.
    lexer = self.Lexer()
    lexer.lineno += count
    self.index.extend([lexer.lexpos] * count)

  def FileLineMsg(self, line, msg):
    """Return |msg| prefixed with 'filename(line + 1) : '.

    When the lexer has no filename the prefix is '<BuiltIn> : ' instead.
    """
    # NOTE(review): the message prints line + 1 while SourceLine() treats
    # |line| as 1 based -- confirm which convention callers rely on.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    """Return source line |line| followed by a caret line marking |pos|."""
    # Create a source line marker
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    """Return a multi-line message: location line, source line, caret."""
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

  #
  # Tokenizer
  #
  # The token function returns the next token provided by IDLLexer for
  # matching against the leaf patterns.
  #
  def token(self):
    """Return the next token, remembering the latest one in self.last.

    The falsy value returned at end of input is passed through unchanged and
    does not overwrite self.last.
    """
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok


  def GetTokens(self):
    """Return a list of all remaining tokens.

    This reads from the PLY lexer directly, so self.last is not updated.
    """
    lexer = self.Lexer()
    outlist = []
    while True:
      t = lexer.token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    """Load |data| as the text to tokenize and reset position tracking.

    Args:
      data: the complete source text.
      filename: name used in diagnostics produced by FileLineMsg().
    """
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')
    # Line start offsets belong to the previous input; start over so that
    # error columns are not computed from stale positions when the lexer is
    # reused.
    self.index = [0]

  def KnownTokens(self):
    """Return this instance's list of token types (the list itself)."""
    return self.tokens

  def Lexer(self):
    """Return the PLY lexer for this object, building it on first use."""
    if self._lexobj is None:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddConstDefs(self):
    """Set the 'literals' and 't_ignore' values read by lex."""
    # 'literals' is a value expected by lex which specifies a list of valid
    # literal tokens, meaning the token type and token value are identical.
    self.literals = r'"*.(){}[],;:=+-/~|&^?<>'
    # Spaces and tabs between tokens are skipped.
    self.t_ignore = ' \t'

  def _AddToken(self, token):
    """Register one token type; raises RuntimeError if already present."""
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    """Register every token type in |tokens| via _AddToken()."""
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    """Register each keyword, using its upper-cased text as the token type."""
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    """Remove each keyword and its upper-cased token type.

    list.remove / del raise ValueError / KeyError for unknown keywords.
    """
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    # Offset of the start of each line seen so far; extended by AddLines().
    self.index = [0]
    self._lex_errors = 0
    # Source text split into lines; replaced by Tokenize().  Initialised here
    # so SourceLine() does not hit a missing attribute before any input is
    # loaded.
    self.lines = []
    # 'linex' looks like a misspelling of 'lines'; it is kept only in case
    # external code reads it.
    self.linex = []
    # Most recent token returned by token().
    self.last = None
    self.filename = None
    # Per-instance copies, so adding or removing keywords on one lexer does
    # not modify the class-level defaults.
    self.keywords = {}
    self.tokens = []
    self._AddConstDefs()
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = None
276
# If run by itself, construct an IDLLexer.  Note that this only runs
# IDLLexer.__init__; the underlying PLY lexer is created lazily by
# IDLLexer.Lexer(), so the token rules are not compiled here.
if __name__ == '__main__':
  lexer = IDLLexer()
280