smaliLexer.flex revision 35bfbe82f00e0946ca267b8634690b9aeb78ec16
1package org.jf.smali;
2
3import org.antlr.runtime.*;
4
5import static org.jf.smali.smaliParser.*;
6
7%%
8
9%public
10%class smaliFlexLexer
11%implements TokenSource
12%type Token
13%unicode
14%line
15%column
16%char
17
%{
    /*
     * Text of the string or character literal currently being scanned. Rule actions append
     * to it piece by piece and beginStringOrChar() clears it.
     * A StringBuilder replaces the legacy synchronized StringBuffer: the buffer is a private
     * field, so only code in this scanner class can reach it.
     */
    private final StringBuilder sb = new StringBuilder();

    /* First error recorded inside the current string/char literal, or null if none. */
    private String stringOrCharError = null;

    /* Position of the opening quote of the literal currently being scanned. */
    private int stringStartLine;
    private int stringStartCol;
    private int stringStartChar;

    /**
     * Returns the next token produced by the generated yylex() method.
     *
     * An IOException from yylex() is reported on stderr and turned into Token.EOF_TOKEN
     * instead of being propagated to the caller.
     */
    public Token nextToken() {
        try {
            return yylex();
        }
        catch (java.io.IOException e) {
            System.err.println("shouldn't happen: " + e.getMessage());
            return Token.EOF_TOKEN;
        }
    }

    /** Sets the current line; callers pass the same numbering getLine() returns (yyline + 1). */
    public void setLine(int line) {
        this.yyline = line-1;
    }

    /** Sets the current column (stored in yycolumn unchanged). */
    public void setColumn(int column) {
        this.yycolumn = column;
    }

    /** Returns the current line, i.e. yyline + 1. */
    public int getLine() {
        return this.yyline+1;
    }

    /** Returns the current column, i.e. yycolumn unchanged. */
    public int getColumn() {
        return this.yycolumn;
    }

    /** Returns the source name used in error headers; always the empty string here. */
    public String getSourceName() {
        return "";
    }

    /**
     * Builds a CommonToken for the current match.
     *
     * @param type   token type constant
     * @param text   text to store in the token
     * @param hidden when true the token is placed on Token.HIDDEN_CHANNEL
     * @return the token, stamped with the start/stop index, line and column of the current match
     */
    private Token newToken(int type, String text, boolean hidden) {
        CommonToken token = new CommonToken(type, text);
        if (hidden) {
            token.setChannel(Token.HIDDEN_CHANNEL);
        }

        token.setStartIndex(yychar);
        token.setStopIndex(yychar + yylength() - 1);
        token.setLine(getLine());
        token.setCharPositionInLine(getColumn());
        return token;
    }

    /** Builds a non-hidden token of the given type with explicit text. */
    private Token newToken(int type, String text) {
        return newToken(type, text, false);
    }

    /** Builds a token of the given type from the matched text, optionally hidden. */
    private Token newToken(int type, boolean hidden) {
        return newToken(type, yytext(), hidden);
    }

    /** Builds a non-hidden token of the given type from the matched text. */
    private Token newToken(int type) {
        return newToken(type, yytext(), false);
    }

    /**
     * Builds an InvalidToken carrying an error message for the current match.
     *
     * @param message description of the problem
     * @param text    text to store in the token
     * @return the token, stamped with the position of the current match
     */
    private Token invalidToken(String message, String text) {
        InvalidToken token = new InvalidToken(message, text);

        token.setStartIndex(yychar);
        token.setStopIndex(yychar + yylength() - 1);
        token.setLine(getLine());
        token.setCharPositionInLine(getColumn());

        return token;
    }

    /** Builds an InvalidToken for the matched text. */
    private Token invalidToken(String message) {
        return invalidToken(message, yytext());
    }

    /**
     * Enters the given lexical state (STRING or CHAR) and resets the per-literal bookkeeping:
     * the text buffer, the recorded error, and the start position of the literal.
     */
    private void beginStringOrChar(int state) {
        yybegin(state);
        sb.setLength(0);
        stringStartLine = getLine();
        stringStartCol = getColumn();
        stringStartChar = yychar;
        stringOrCharError = null;
    }

    /**
     * Finishes the current literal and returns to YYINITIAL.
     *
     * @param type token type to emit when the literal is well formed
     * @return an InvalidToken if an error was recorded while scanning the literal, otherwise a
     *         CommonToken holding the accumulated text and spanning from the opening quote to
     *         the end of the current match
     */
    private Token endStringOrChar(int type) {
        yybegin(YYINITIAL);

        if (stringOrCharError != null) {
            return invalidStringOrChar(stringOrCharError);
        }

        CommonToken token = new CommonToken(type, sb.toString());
        token.setStartIndex(stringStartChar);
        token.setStopIndex(yychar + yylength() - 1);
        token.setLine(stringStartLine);
        token.setCharPositionInLine(stringStartCol);
        return token;
    }

    /** Records an error for the current literal; only the first error is kept. */
    private void setStringOrCharError(String message) {
        if (stringOrCharError == null) {
            stringOrCharError = message;
        }
    }

    /**
     * Abandons the current literal: returns to YYINITIAL and produces an InvalidToken that
     * carries the accumulated text and spans from the opening quote to the end of the
     * current match.
     */
    private Token invalidStringOrChar(String message) {
        yybegin(YYINITIAL);

        InvalidToken token = new InvalidToken(message, sb.toString());
        token.setStartIndex(stringStartChar);
        token.setStopIndex(yychar + yylength() - 1);
        token.setLine(stringStartLine);
        token.setCharPositionInLine(stringStartCol);
        return token;
    }

    /** Formats the location prefix for an error message as sourceName[line,column]. */
    public String getErrorHeader(RecognitionException e) {
        return getSourceName()+"["+ e.line+","+e.charPositionInLine+"]";
    }
%}
140
/* "0x" / "0X" prefix of hexadecimal literals */
HexPrefix = "0" [xX]

/* one hex digit, exactly four of them (a full \u escape), and zero to three (a short one) */
HexDigit = [0-9A-Fa-f]
HexDigits = {HexDigit}{4}
FewerHexDigits = {HexDigit}{0,3}
146
/* zero, decimal without leading zero, octal with leading zero, hexadecimal */
Integer1 = "0"
Integer2 = [1-9][0-9]*
Integer3 = "0" [0-7]+
Integer4 = {HexPrefix}{HexDigit}+
Integer = {Integer1}
        | {Integer2}
        | {Integer3}
        | {Integer4}
152
/* exponent suffixes; only an optional minus sign is accepted */
DecimalExponent = [eE] "-"? [0-9]+

BinaryExponent = [pP] "-"? [0-9]+

/* Forms without a decimal point: each may be a floating point number or an identifier */
FloatOrID1 = "-"? [0-9]+ {DecimalExponent}
FloatOrID2 = "-"? {HexPrefix} {HexDigit}+ {BinaryExponent}
FloatOrID3 = "-"? [iI] [nN] [fF] [iI] [nN] [iI] [tT] [yY]
FloatOrID4 = [nN] [aA] [nN]
FloatOrID = {FloatOrID1}
          | {FloatOrID2}
          | {FloatOrID3}
          | {FloatOrID4}
163
164
/* Forms containing a decimal point: these can only be floats, never identifiers */
Float1 = "-"? [0-9]+ "." [0-9]* {DecimalExponent}?
Float2 = "-"? "." [0-9]+ {DecimalExponent}?
Float3 = "-"? {HexPrefix} {HexDigit}+ "." {HexDigit}* {BinaryExponent}
Float4 = "-"? {HexPrefix} "." {HexDigit}+ {BinaryExponent}
Float = {Float1}
      | {Float2}
      | {Float3}
      | {Float4}
171
/* characters accepted in a simple (unqualified) name */
SimpleName = [A-Za-z0-9$\-_\u00a1-\u1fff\u2010-\u2027\u2030-\ud7ff\ue000-\uffef]+

/* single-letter primitive type codes */
PrimitiveType = [ZBSCIJFD]

/* "L", slash-separated name segments, terminating ";" */
ClassDescriptor = "L" ({SimpleName} "/")* {SimpleName} ";"

/* one or more "[" followed by the element type */
ArrayDescriptor = "["+ ({PrimitiveType} | {ClassDescriptor})

Type = {PrimitiveType}
     | {ClassDescriptor}
     | {ArrayDescriptor}
181
182
183%state STRING
184%state CHAR
185
186%%
187
/*Directives*/
<YYINITIAL>
{
    ".class" { return newToken(CLASS_DIRECTIVE); }
    ".super" { return newToken(SUPER_DIRECTIVE); }
    ".implements" { return newToken(IMPLEMENTS_DIRECTIVE); }
    ".source" { return newToken(SOURCE_DIRECTIVE); }
    ".field" { return newToken(FIELD_DIRECTIVE); }
    ".end field" { return newToken(END_FIELD_DIRECTIVE); }
    ".subannotation" { return newToken(SUBANNOTATION_DIRECTIVE); }
    ".end subannotation" { return newToken(END_SUBANNOTATION_DIRECTIVE); }
    ".annotation" { return newToken(ANNOTATION_DIRECTIVE); }
    ".end annotation" { return newToken(END_ANNOTATION_DIRECTIVE); }
    ".enum" { return newToken(ENUM_DIRECTIVE); }
    ".method" { return newToken(METHOD_DIRECTIVE); }
    ".end method" { return newToken(END_METHOD_DIRECTIVE); }
    ".registers" { return newToken(REGISTERS_DIRECTIVE); }
    ".locals" { return newToken(LOCALS_DIRECTIVE); }
    ".array-data" { return newToken(ARRAY_DATA_DIRECTIVE); }
    ".end array-data" { return newToken(END_ARRAY_DATA_DIRECTIVE); }
    ".packed-switch" { return newToken(PACKED_SWITCH_DIRECTIVE); }
    ".end packed-switch" { return newToken(END_PACKED_SWITCH_DIRECTIVE); }
    ".sparse-switch" { return newToken(SPARSE_SWITCH_DIRECTIVE); }
    ".end sparse-switch" { return newToken(END_SPARSE_SWITCH_DIRECTIVE); }
    ".catch" { return newToken(CATCH_DIRECTIVE); }
    ".catchall" { return newToken(CATCHALL_DIRECTIVE); }
    ".line" { return newToken(LINE_DIRECTIVE); }
    ".parameter" { return newToken(PARAMETER_DIRECTIVE); }
    ".end parameter" { return newToken(END_PARAMETER_DIRECTIVE); }
    ".local" { return newToken(LOCAL_DIRECTIVE); }
    ".end local" { return newToken(END_LOCAL_DIRECTIVE); }
    ".restart local" { return newToken(RESTART_LOCAL_DIRECTIVE); }
    ".prologue" { return newToken(PROLOGUE_DIRECTIVE); }
    ".epilogue" { return newToken(EPILOGUE_DIRECTIVE); }

    /*
     * Two-word directives that are not in the list above. Longest match picks the valid
     * rules first, so these fire only for unknown second words (or a bare ".end"/".restart").
     * The word class is a-z, A-Z, 0-9, '-' and '_'; it previously read "A-z", which also
     * swallowed '[', '\', ']', '^' and '`' into the invalid token.
     */
    ".end" { return invalidToken("Invalid directive"); }
    ".end " [a-zA-Z0-9\-_]+ { return invalidToken("Invalid directive"); }
    ".restart" { return invalidToken("Invalid directive"); }
    ".restart " [a-zA-Z0-9\-_]+ { return invalidToken("Invalid directive"); }
}
228
/*Literals*/
<YYINITIAL> {
    /* integers: no suffix, or l/s/t for long/short/byte */
    "-"? {Integer}      { return newToken(INTEGER_LITERAL); }
    "-"? {Integer} [lL] { return newToken(LONG_LITERAL); }
    "-"? {Integer} [sS] { return newToken(SHORT_LITERAL); }
    "-"? {Integer} [tT] { return newToken(BYTE_LITERAL); }

    /* floating point; the *_OR_ID forms contain no decimal point */
    ({FloatOrID} [fF]) | ("-"? [0-9]+ [fF])  { return newToken(FLOAT_LITERAL_OR_ID); }
    ({FloatOrID} [dD]?) | ("-"? [0-9]+ [dD]) { return newToken(DOUBLE_LITERAL_OR_ID); }
    {Float} [fF]  { return newToken(FLOAT_LITERAL); }
    {Float} [dD]? { return newToken(DOUBLE_LITERAL); }

    "true" | "false" { return newToken(BOOL_LITERAL); }
    "null" { return newToken(NULL_LITERAL); }

    /*
     * Opening quotes: switch to the STRING/CHAR state and seed the buffer with the quote.
     * These actions do not return, so scanning continues inside the literal.
     */
    "\"" {
        beginStringOrChar(STRING);
        sb.append('"');
    }

    "'" {
        beginStringOrChar(CHAR);
        sb.append('\'');
    }
}
248
<STRING> {
    /* closing quote: emit the literal (or the first error recorded inside it) */
    "\""  { sb.append('"'); return endStringOrChar(STRING_LITERAL); }

    /* ordinary characters and the recognised escapes */
    [^\r\n\"\\]+ { sb.append(yytext()); }
    "\\b" { sb.append('\b'); }
    "\\t" { sb.append('\t'); }
    "\\n" { sb.append('\n'); }
    "\\f" { sb.append('\f'); }
    "\\r" { sb.append('\r'); }
    "\\'" { sb.append('\''); }
    "\\\"" { sb.append('"'); }
    "\\\\" { sb.append('\\'); }
    "\\u" {HexDigits} { sb.append((char)Integer.parseInt(yytext().substring(2,6), 16)); }

    /* \u with fewer than four hex digits: keep the raw text and remember the error */
    "\\u" {FewerHexDigits} {
        sb.append(yytext());
        setStringOrCharError("Invalid \\u sequence. \\u must be followed by 4 hex digits");
    }

    /* backslash followed by a character that does not start a known escape */
    "\\" [^btnfr'\"\\u] {
        sb.append(yytext());
        setStringOrCharError("Invalid escape sequence " + yytext());
    }

    /*
     * Backslash with nothing after it. Every rule above needs at least one more character,
     * so this only wins when the backslash is the last character of the input. Without it
     * no rule in this state matches and the scanner cannot produce a token at all; with it
     * the EOF rule below reports the unterminated literal.
     */
    "\\" {
        sb.append(yytext());
        setStringOrCharError("Invalid escape sequence " + yytext());
    }

    [\r\n] { return invalidStringOrChar("Unterminated string literal"); }
    <<EOF>> { return invalidStringOrChar("Unterminated string literal"); }
}
276
<CHAR> {
    /* closing quote */
    ' {
        sb.append('\'');

        /*
         * Report a recorded escape error first. An invalid escape always appends at least
         * two characters to the buffer, so checking the length first would always report
         * "multiple chars" and the more specific escape message would never be shown.
         */
        if (stringOrCharError != null) {
            return invalidStringOrChar(stringOrCharError);
        }

        /* the buffer holds both quotes, so a well-formed literal has length 3 */
        if (sb.length() == 2) {
            return invalidStringOrChar("Empty character literal");
        } else if (sb.length() > 3) {
            return invalidStringOrChar("Character literal with multiple chars");
        }

        return endStringOrChar(CHAR_LITERAL);
    }

    /* ordinary characters and the recognised escapes */
    [^\r\n'\\]+ { sb.append(yytext()); }
    "\\b" { sb.append('\b'); }
    "\\t" { sb.append('\t'); }
    "\\n" { sb.append('\n'); }
    "\\f" { sb.append('\f'); }
    "\\r" { sb.append('\r'); }
    "\\'" { sb.append('\''); }
    "\\\"" { sb.append('"'); }
    "\\\\" { sb.append('\\'); }
    "\\u" {HexDigits} { sb.append((char)Integer.parseInt(yytext().substring(2,6), 16)); }

    /* \u followed by any other number of hex digits (longest match beats the rule above) */
    "\\u" {HexDigit}* {
        sb.append(yytext());
        setStringOrCharError("Invalid \\u sequence. \\u must be followed by exactly 4 hex digits");
    }

    /* backslash followed by a character that does not start a known escape */
    "\\" [^btnfr'\"\\u] {
        sb.append(yytext());
        setStringOrCharError("Invalid escape sequence " + yytext());
    }

    /*
     * Backslash with nothing after it. Every rule above needs at least one more character,
     * so this only wins when the backslash is the last character of the input. Without it
     * no rule in this state matches and the scanner cannot produce a token at all; with it
     * the EOF rule below reports the unterminated literal.
     */
    "\\" {
        sb.append(yytext());
        setStringOrCharError("Invalid escape sequence " + yytext());
    }

    [\r\n] { return invalidStringOrChar("Unterminated character literal"); }
    <<EOF>> { return invalidStringOrChar("Unterminated character literal"); }
}
313
/*Misc*/
<YYINITIAL> {
    /* registers: v<digits> or p<digits> */
    [vp] [0-9]+ { return newToken(REGISTER); }

    "build" | "runtime" | "system" { return newToken(ANNOTATION_VISIBILITY); }

    "public" | "private" | "protected" | "static" | "final" | "synchronized" |
    "bridge" | "varargs" | "native" | "abstract" | "strictfp" | "synthetic" |
    "constructor" | "declared-synchronized" | "interface" | "enum" | "annotation" |
    "volatile" | "transient" { return newToken(ACCESS_SPEC); }

    "vtable@0x" {HexDigit}+ { return newToken(VTABLE_OFFSET); }
    "field@0x" {HexDigit}+ { return newToken(FIELD_OFFSET); }

    "+" {Integer} { return newToken(OFFSET); }

    /* comment: '#' to end of line, including the line terminator when present; hidden */
    "#" [^\r\n]* ("\r\n" | [\r\n])? { return newToken(LINE_COMMENT, true); }
}
335
/*Instructions*/
/* One rule per instruction format; each returns the matching INSTRUCTION_FORMAT* token. */
<YYINITIAL> {
    "goto" { return newToken(INSTRUCTION_FORMAT10t); }

    "return-void" | "nop" { return newToken(INSTRUCTION_FORMAT10x); }

    "const/4" { return newToken(INSTRUCTION_FORMAT11n); }

    "move-result" | "move-result-wide" | "move-result-object" | "move-exception" |
    "return" | "return-wide" | "return-object" |
    "monitor-enter" | "monitor-exit" | "throw" { return newToken(INSTRUCTION_FORMAT11x); }

    "move" | "move-wide" | "move-object" | "array-length" |
    "neg-int" | "not-int" | "neg-long" | "not-long" | "neg-float" | "neg-double" |
    "int-to-long" | "int-to-float" | "int-to-double" |
    "long-to-int" | "long-to-float" | "long-to-double" |
    "float-to-int" | "float-to-long" | "float-to-double" |
    "double-to-int" | "double-to-long" | "double-to-float" |
    "int-to-byte" | "int-to-char" | "int-to-short" { return newToken(INSTRUCTION_FORMAT12x_OR_ID); }

    "add-int/2addr" | "sub-int/2addr" | "mul-int/2addr" | "div-int/2addr" | "rem-int/2addr" |
    "and-int/2addr" | "or-int/2addr" | "xor-int/2addr" |
    "shl-int/2addr" | "shr-int/2addr" | "ushr-int/2addr" |
    "add-long/2addr" | "sub-long/2addr" | "mul-long/2addr" | "div-long/2addr" | "rem-long/2addr" |
    "and-long/2addr" | "or-long/2addr" | "xor-long/2addr" |
    "shl-long/2addr" | "shr-long/2addr" | "ushr-long/2addr" |
    "add-float/2addr" | "sub-float/2addr" | "mul-float/2addr" | "div-float/2addr" |
    "rem-float/2addr" |
    "add-double/2addr" | "sub-double/2addr" | "mul-double/2addr" | "div-double/2addr" |
    "rem-double/2addr" { return newToken(INSTRUCTION_FORMAT12x); }

    "goto/16" { return newToken(INSTRUCTION_FORMAT20t); }

    "sget" | "sget-wide" | "sget-object" | "sget-boolean" | "sget-byte" | "sget-char" |
    "sget-short" |
    "sput" | "sput-wide" | "sput-object" | "sput-boolean" | "sput-byte" | "sput-char" |
    "sput-short" { return newToken(INSTRUCTION_FORMAT21c_FIELD); }

    "const-string" { return newToken(INSTRUCTION_FORMAT21c_STRING); }

    "check-cast" | "new-instance" | "const-class" { return newToken(INSTRUCTION_FORMAT21c_TYPE); }

    "const/high16" | "const-wide/high16" { return newToken(INSTRUCTION_FORMAT21h); }

    "const/16" | "const-wide/16" { return newToken(INSTRUCTION_FORMAT21s); }

    "if-eqz" | "if-nez" | "if-ltz" | "if-gez" | "if-gtz" | "if-lez" {
        return newToken(INSTRUCTION_FORMAT21t);
    }

    "add-int/lit8" | "rsub-int/lit8" | "mul-int/lit8" | "div-int/lit8" | "rem-int/lit8" |
    "and-int/lit8" | "or-int/lit8" | "xor-int/lit8" |
    "shl-int/lit8" | "shr-int/lit8" | "ushr-int/lit8" { return newToken(INSTRUCTION_FORMAT22b); }

    "iget" | "iget-wide" | "iget-object" | "iget-boolean" | "iget-byte" | "iget-char" |
    "iget-short" |
    "iput" | "iput-wide" | "iput-object" | "iput-boolean" | "iput-byte" | "iput-char" |
    "iput-short" { return newToken(INSTRUCTION_FORMAT22c_FIELD); }

    "instance-of" | "new-array" { return newToken(INSTRUCTION_FORMAT22c_TYPE); }

    "iget-quick" | "iget-wide-quick" | "iget-object-quick" |
    "iput-quick" | "iput-wide-quick" | "iput-object-quick" {
        return newToken(INSTRUCTION_FORMAT22cs_FIELD);
    }

    "rsub-int" { return newToken(INSTRUCTION_FORMAT22s_OR_ID); }

    "add-int/lit16" | "mul-int/lit16" | "div-int/lit16" | "rem-int/lit16" |
    "and-int/lit16" | "or-int/lit16" | "xor-int/lit16" { return newToken(INSTRUCTION_FORMAT22s); }

    "if-eq" | "if-ne" | "if-lt" | "if-ge" | "if-gt" | "if-le" {
        return newToken(INSTRUCTION_FORMAT22t);
    }

    "move/from16" | "move-wide/from16" | "move-object/from16" {
        return newToken(INSTRUCTION_FORMAT22x);
    }

    "cmpl-float" | "cmpg-float" | "cmpl-double" | "cmpg-double" | "cmp-long" |
    "aget" | "aget-wide" | "aget-object" | "aget-boolean" | "aget-byte" | "aget-char" |
    "aget-short" |
    "aput" | "aput-wide" | "aput-object" | "aput-boolean" | "aput-byte" | "aput-char" |
    "aput-short" |
    "add-int" | "sub-int" | "mul-int" | "div-int" | "rem-int" |
    "and-int" | "or-int" | "xor-int" | "shl-int" | "shr-int" | "ushr-int" |
    "add-long" | "sub-long" | "mul-long" | "div-long" | "rem-long" |
    "and-long" | "or-long" | "xor-long" | "shl-long" | "shr-long" | "ushr-long" |
    "add-float" | "sub-float" | "mul-float" | "div-float" | "rem-float" |
    "add-double" | "sub-double" | "mul-double" | "div-double" |
    "rem-double" { return newToken(INSTRUCTION_FORMAT23x); }

    "goto/32" { return newToken(INSTRUCTION_FORMAT30t); }

    "const-string/jumbo" { return newToken(INSTRUCTION_FORMAT31c); }

    "const" { return newToken(INSTRUCTION_FORMAT31i_OR_ID); }

    "const-wide/32" { return newToken(INSTRUCTION_FORMAT31i); }

    "fill-array-data" | "packed-switch" | "sparse-switch" {
        return newToken(INSTRUCTION_FORMAT31t);
    }

    "move/16" | "move-wide/16" | "move-object/16" { return newToken(INSTRUCTION_FORMAT32x); }

    "invoke-virtual" | "invoke-super" | "invoke-direct" | "invoke-static" |
    "invoke-interface" { return newToken(INSTRUCTION_FORMAT35c_METHOD); }

    "filled-new-array" { return newToken(INSTRUCTION_FORMAT35c_TYPE); }

    "invoke-direct-empty" { return newToken(INSTRUCTION_FORMAT35s_METHOD); }

    "execute-inline" | "invoke-virtual-quick" | "invoke-super-quick" {
        return newToken(INSTRUCTION_FORMAT35ms_METHOD);
    }

    "invoke-virtual/range" | "invoke-super/range" | "invoke-direct/range" |
    "invoke-static/range" | "invoke-interface/range" {
        return newToken(INSTRUCTION_FORMAT3rc_METHOD);
    }

    "filled-new-array/range" { return newToken(INSTRUCTION_FORMAT3rc_TYPE); }

    "invoke-virtual-quick/range" | "invoke-super-quick/range" {
        return newToken(INSTRUCTION_FORMAT3rms_METHOD);
    }

    "const-wide" { return newToken(INSTRUCTION_FORMAT51l); }
}
502
/*Types*/
/* Rule order matters here: on equal-length matches the earlier rule wins. */
<YYINITIAL> {
    {PrimitiveType}   { return newToken(PRIMITIVE_TYPE); }
    "V"               { return newToken(VOID_TYPE); }
    {ClassDescriptor} { return newToken(CLASS_DESCRIPTOR); }
    {ArrayDescriptor} { return newToken(ARRAY_DESCRIPTOR); }

    /* two or more types back to back; all-primitive runs could also be an identifier */
    {PrimitiveType} {PrimitiveType}+ { return newToken(PARAM_LIST_OR_ID); }
    {Type} {Type}+                   { return newToken(PARAM_LIST); }

    {SimpleName}          { return newToken(SIMPLE_NAME); }
    "<init>" | "<clinit>" { return newToken(METHOD_NAME); }
}
514
/*Symbols/Whitespace/EOF*/
<YYINITIAL> {
    ".." { return newToken(DOTDOT); }
    "->" { return newToken(ARROW); }
    "="  { return newToken(EQUAL); }
    ":"  { return newToken(COLON); }
    ","  { return newToken(COMMA); }
    "{"  { return newToken(OPEN_BRACE); }
    "}"  { return newToken(CLOSE_BRACE); }
    "("  { return newToken(OPEN_PAREN); }
    ")"  { return newToken(CLOSE_PAREN); }

    /* a run of whitespace becomes a single hidden token */
    [ \t\r\n]+ { return newToken(WHITE_SPACE, true); }

    <<EOF>> { return newToken(EOF); }
}
529
/*catch all*/
/*
 * Fallbacks for text no earlier rule accepts. A '.' followed by directive-like characters is
 * reported as one invalid directive; any other single character is invalid text.
 * The character classes are a-z, A-Z (plus digits after the first character), '-' and '_';
 * they previously read "A-z", which also admitted '[', '\', ']', '^' and '`'.
 * The separate one-character rule was dropped because the starred rule already matches it.
 */
<YYINITIAL> {
    "." { return invalidToken("Invalid directive"); }
    "." [a-zA-Z\-_] [a-zA-Z0-9\-_]* { return invalidToken("Invalid directive"); }
    . { return invalidToken("Invalid text"); }
}
537