idl_lexer.py revision 5821806d5e7f356e8fa4b058a389a808ea183019
#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

import os.path
import re
import sys

#
# Try to load the ply module; if it is not found, assume it lives in the
# third_party directory, relative to ppapi.
#
try:
  from ply import lex
except:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
    'COMMENT',
    'DESCRIBE',
    'ENUM',
    'LABEL',
    'SYMBOL',
    'INLINE',
    'INTERFACE',
    'STRUCT',
    'TYPEDEF',

    # Extra WebIDL keywords
    'CALLBACK',
    'DICTIONARY',
    'OPTIONAL',
    'READONLY',
    'STATIC',

    # Invented for apps use
    'NAMESPACE',

    # Data types
    'FLOAT',
    'OCT',
    'INT',
    'HEX',
    'STRING',

    # Operators
    'LSHIFT',
    'RSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.  These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
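  # For example (illustrative, not part of the original file), the source text
  #   "pp\api"
  # tokenizes as a single STRING whose value is everything between the quotes,
  # with the backslash left untouched.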
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'SYMBOL')
    return t

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused.  We still
    # fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    # Start with line 1, not zero
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)

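#
# Illustrative example, not part of the original file.  It shows how IDLLexer
# is driven directly: identifiers that appear in 'keywords' come back with the
# keyword's token type, anything else comes back as a SYMBOL.  The function
# name, file name and source snippet below are made up for demonstration.
#
def ExampleTokenTypes():
  lexer = IDLLexer()
  lexer.SetData('example.idl', 'interface Example')
  types = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    types.append(t.type)
  # types is now ['INTERFACE', 'SYMBOL']
  return types
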
#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write(' Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TestSame
#
# From a set of token values, recreate a source text, tokenize it again, and
# compare the new set of values against the old one.
#
def TestSame(values1):
  # Recreate the source from the tokens.  We use newline instead of whitespace
  # since the '//' and #inline regex are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1


#
# TestExpect
#
# From a set of token pairs, verify that the type of the second token in each
# pair matches the value of the first, so that:
#   INT 123 FLOAT 1.1
# generates a passing test, where the first token is the SYMBOL INT, the
# second token is the INT 123, the third token is the SYMBOL FLOAT and the
# fourth is the FLOAT 1.1, etc...
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
    return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))
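
#
# Illustrative usage, not part of the original file.  The IDL snippet below is
# made up for demonstration:
#
#   values = TextToTokens('interface Foo { };')
#   # values == ['interface', 'Foo', '{', '}', ';']
#   TestSame(values)      # prints 'Same: Pass' and returns 0
#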