idl_lexer.py revision 3240926e260ce088908e02ac07a6cf7b0c0cbf44
#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses the PLY library to build a tokenizer which understands both
WebIDL and Pepper tokens.

WebIDL, and the WebIDL regular expressions, can be found at:
   http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""

import os.path
import sys

#
# Try to load the ply module; if it is not found, assume it is in the
# third_party directory.
#
try:
  # Disable the lint check, which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # Multi-character operators
    'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are looked up in the keywords dictionary to determine
  # whether the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'exception' : 'EXCEPTION',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'getter' : 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter' : 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'true' : 'TRUE',
    'typedef' : 'TYPEDEF',
    'unrestricted' : 'UNRESTRICTED',
    'unsigned' : 'UNSIGNED',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function of the form 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in IDL strings. Strings are used exclusively
  # for attributes and enums, and not as typical 'C' constants.
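  # For example (illustrative), the input  "a\tb"  yields a string token
  # whose value is the four characters  a\tb  between the quotes: the
  # backslash and 't' are kept verbatim rather than decoded to a tab.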
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */  or  //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position.
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused. We
    # still fill the array, however, to make sure the line count is correct.
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker.
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

#
# Tokenizer
#
# The token function returns the next token provided by IDLLexer for matching
# against the leaf patterns.
#
  def token(self):
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok

  def GetTokens(self):
    outlist = []
    while True:
      t = self.Lexer().token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')

  def KnownTokens(self):
    return self.tokens

  def Lexer(self):
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddConstDefs(self):
    # 'literals' is a value expected by lex which specifies a list of valid
    # literal tokens, meaning the token type and token value are identical.
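    # For example (illustrative), with '=' listed in 'literals', the input
    # 'x = 1' produces a token whose type and value are both '=', with no
    # explicit t_ rule required.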
    self.literals = r'"*.(){}[],;:=+-/~|&^?<>'
    self.t_ignore = ' \t'

  def _AddToken(self, token):
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    self.index = [0]
    self._lex_errors = 0
    self.lines = []
    self.last = None
    self.filename = None
    self.keywords = {}
    self.tokens = []
    self._AddConstDefs()
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = None

# If run by itself, attempt to build the lexer.
if __name__ == '__main__':
  lexer = IDLLexer()
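  # Minimal usage sketch: tokenize a small IDL fragment and dump each
  # token's type and value. The fragment below is hypothetical sample
  # input, not taken from the PPAPI sources.
  sample = '''
      /* A sample comment */
      interface Sample {
        attribute long bar;
      };
      '''
  lexer.Tokenize(sample)
  for tok in lexer.GetTokens():
    print('%s : %s' % (tok.type, tok.value))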