1package java_cup;
2
3import java.util.Hashtable;
4
5import java_cup.runtime.str_token;
6import java_cup.runtime.token;
7
/** This class implements a small scanner (aka lexical analyzer or lexer) for
 *  the JavaCup specification.  This scanner reads characters from standard
 *  input (System.in) and returns token objects whose symbol number is the
 *  terminal number of the next token.  Once end of input is reached the EOF
 *  token is returned on every subsequent call.<p>
 *  Tokens currently returned include: <pre>
 *    Symbol        Constant Returned     Symbol        Constant Returned
 *    ------        -----------------     ------        -----------------
 *    "package"     PACKAGE               "import"      IMPORT
 *    "code"        CODE                  "action"      ACTION
 *    "parser"      PARSER                "terminal"    TERMINAL
 *    "non"         NON                   "init"        INIT
 *    "scan"        SCAN                  "with"        WITH
 *    "start"       START                   ;           SEMI
 *      ,           COMMA                   *           STAR
 *      .           DOT                     :           COLON
 *      ::=         COLON_COLON_EQUALS      |           BAR
 *    identifier    ID                    {:...:}       CODE_STRING
 *    "debug"       DEBUG
 *  </pre>
 *  All symbol constants are defined in sym.java which is generated by
 *  JavaCup from parser.cup.<p>
 *
 *  In addition to the scanner proper (initialized first via init(), then
 *  called through next_token() to get each token) this class provides simple
 *  error and warning routines and keeps a count of errors and warnings that
 *  is publicly accessible.<p>
 *
 *  This class is "static" (i.e., it has only static members and methods).
 *
 * @version last updated: 11/25/95
 * @author  Scott Hudson
 */
41public class lexer {
42
43  /*-----------------------------------------------------------*/
44  /*--- Constructor(s) ----------------------------------------*/
45  /*-----------------------------------------------------------*/
46
47  /** The only constructor is private, so no instances can be created. */
48  private lexer() { }
49
50  /*-----------------------------------------------------------*/
51  /*--- Static (Class) Variables ------------------------------*/
52  /*-----------------------------------------------------------*/
53
54  /** First character of lookahead. */
55  protected static int next_char;
56
57  /** Second character of lookahead. */
58  protected static int next_char2;
59
60  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
61
62  /** EOF constant. */
63  protected static final int EOF_CHAR = -1;
64
65  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
66
67  /** Table of keywords.  Keywords are initially treated as identifiers.
68   *  Just before they are returned we look them up in this table to see if
69   *  they match one of the keywords.  The string of the name is the key here,
70   *  which indexes Integer objects holding the symbol number.
71   */
72  protected static Hashtable keywords = new Hashtable(23);
73
74  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
75
76  /** Table of single character symbols.  For ease of implementation, we
77   *  store all unambiguous single character tokens in this table of Integer
78   *  objects keyed by Integer objects with the numerical value of the
79   *  appropriate char (currently Character objects have a bug which precludes
80   *  their use in tables).
81   */
82  protected static Hashtable char_symbols = new Hashtable(11);
83
84  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
85
86  /** Current line number for use in error messages. */
87  protected static int current_line = 1;
88
89  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
90
91  /** Character position in current line. */
92  protected static int current_position = 1;
93
94  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
95
96  /** Count of total errors detected so far. */
97  public static int error_count = 0;
98
99  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
100
101  /** Count of warnings issued so far */
102  public static int warning_count = 0;
103
104  /*-----------------------------------------------------------*/
105  /*--- Static Methods ----------------------------------------*/
106  /*-----------------------------------------------------------*/
107
108  /** Initialize the scanner.  This sets up the keywords and char_symbols
109    * tables and reads the first two characters of lookahead.
110    */
111  public static void init() throws java.io.IOException
112    {
113      /* set up the keyword table */
114      keywords.put("package",  new Integer(sym.PACKAGE));
115      keywords.put("import",   new Integer(sym.IMPORT));
116      keywords.put("code",     new Integer(sym.CODE));
117      keywords.put("action",   new Integer(sym.ACTION));
118      keywords.put("parser",   new Integer(sym.PARSER));
119      keywords.put("terminal", new Integer(sym.TERMINAL));
120      keywords.put("non",      new Integer(sym.NON));
121      keywords.put("init",     new Integer(sym.INIT));
122      keywords.put("scan",     new Integer(sym.SCAN));
123      keywords.put("with",     new Integer(sym.WITH));
124      keywords.put("start",    new Integer(sym.START));
125      keywords.put("debug",    new Integer(sym.DEBUG));
126
127      /* set up the table of single character symbols */
128      char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
129      char_symbols.put(new Integer(','), new Integer(sym.COMMA));
130      char_symbols.put(new Integer('*'), new Integer(sym.STAR));
131      char_symbols.put(new Integer('.'), new Integer(sym.DOT));
132      char_symbols.put(new Integer('|'), new Integer(sym.BAR));
133
134      /* read two characters of lookahead */
135      next_char = System.in.read();
136      if (next_char == EOF_CHAR)
137    next_char2 = EOF_CHAR;
138      else
139    next_char2 = System.in.read();
140    }
141
142  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
143
144  /** Advance the scanner one character in the input stream.  This moves
145   * next_char2 to next_char and then reads a new next_char2.
146   */
147  protected static void advance() throws java.io.IOException
148    {
149      int old_char;
150
151      old_char = next_char;
152      next_char = next_char2;
153      if (next_char == EOF_CHAR)
154    next_char2 = EOF_CHAR;
155      else
156    next_char2 = System.in.read();
157
158      /* count this */
159      current_position++;
160      if (old_char == '\n')
161    {
162      current_line++;
163      current_position = 1;
164    }
165    }
166
167  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
168
169  /** Emit an error message.  The message will be marked with both the
170   *  current line number and the position in the line.  Error messages
171   *  are printed on standard error (System.err).
172   * @param message the message to print.
173   */
174  public static void emit_error(String message)
175    {
176      System.err.println("Error at " + current_line + "(" + current_position +
177             "): " + message);
178      error_count++;
179    }
180
181  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
182
183  /** Emit a warning message.  The message will be marked with both the
184   *  current line number and the position in the line.  Messages are
185   *  printed on standard error (System.err).
186   * @param message the message to print.
187   */
188  public static void emit_warn(String message)
189    {
190      System.err.println("Warning at " + current_line + "(" + current_position +
191             "): " + message);
192      warning_count++;
193    }
194
195  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
196
197  /** Determine if a character is ok to start an id.
198   * @param ch the character in question.
199   */
200  protected static boolean id_start_char(int ch)
201    {
202      return (ch >= 'a' &&  ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
203         (ch == '_');
204
205      // later need to deal with non-8-bit chars here
206    }
207
208  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
209
210  /** Determine if a character is ok for the middle of an id.
211   * @param ch the character in question.
212   */
213  protected static boolean id_char(int ch)
214    {
215      return id_start_char(ch) || (ch >= '0' && ch <= '9');
216    }
217
218  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
219
220  /** Try to look up a single character symbol, returns -1 for not found.
221   * @param ch the character in question.
222   */
223  protected static int find_single_char(int ch)
224    {
225      Integer result;
226
227      result = (Integer)char_symbols.get(new Integer((char)ch));
228      if (result == null)
229    return -1;
230      else
231    return result.intValue();
232    }
233
234  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
235
236  /** Handle swallowing up a comment.  Both old style C and new style C++
237   *  comments are handled.
238   */
239  protected static void swallow_comment() throws java.io.IOException
240    {
241      /* next_char == '/' at this point */
242
243      /* is it a traditional comment */
244      if (next_char2 == '*')
245    {
246      /* swallow the opener */
247      advance(); advance();
248
249      /* swallow the comment until end of comment or EOF */
250      for (;;)
251        {
252          /* if its EOF we have an error */
253          if (next_char == EOF_CHAR)
254        {
255          emit_error("Specification file ends inside a comment");
256          return;
257        }
258
259          /* if we can see the closer we are done */
260          if (next_char == '*' && next_char2 == '/')
261        {
262          advance();
263          advance();
264          return;
265        }
266
267          /* otherwise swallow char and move on */
268          advance();
269        }
270    }
271
272      /* is its a new style comment */
273      if (next_char2 == '/')
274    {
275      /* swallow the opener */
276      advance(); advance();
277
278      /* swallow to '\n', '\f', or EOF */
279      while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
280        advance();
281
282      return;
283
284    }
285
286      /* shouldn't get here, but... if we get here we have an error */
287      emit_error("Malformed comment in specification -- ignored");
288      advance();
289    }
290
291  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
292
293  /** Swallow up a code string.  Code strings begin with "{:" and include
294      all characters up to the first occurrence of ":}" (there is no way to
295      include ":}" inside a code string).  The routine returns an str_token
296      object suitable for return by the scanner.
297   */
298  protected static token do_code_string() throws java.io.IOException
299    {
300      StringBuffer result = new StringBuffer();
301
302      /* at this point we have lookahead of "{:" -- swallow that */
303      advance(); advance();
304
305      /* save chars until we see ":}" */
306      while (!(next_char == ':' && next_char2 == '}'))
307    {
308      /* if we have run off the end issue a message and break out of loop */
309      if (next_char == EOF_CHAR)
310        {
311          emit_error("Specification file ends inside a code string");
312          break;
313        }
314
315      /* otherwise record the char and move on */
316      result.append(new Character((char)next_char));
317      advance();
318    }
319
320      /* advance past the closer and build a return token */
321      advance(); advance();
322      return new str_token(sym.CODE_STRING, result.toString());
323    }
324
325  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
326
327  /** Process an identifier.  Identifiers begin with a letter, underscore,
328   *  or dollar sign, which is followed by zero or more letters, numbers,
329   *  underscores or dollar signs.  This routine returns an str_token suitable
330   *  for return by the scanner.
331   */
332  protected static token do_id() throws java.io.IOException
333    {
334      StringBuffer result = new StringBuffer();
335      String       result_str;
336      Integer      keyword_num;
337      char         buffer[] = new char[1];
338
339      /* next_char holds first character of id */
340      buffer[0] = (char)next_char;
341      result.append(buffer,0,1);
342      advance();
343
344      /* collect up characters while they fit in id */
345      while(id_char(next_char))
346    {
347          buffer[0] = (char)next_char;
348      result.append(buffer,0,1);
349      advance();
350    }
351
352      /* extract a string and try to look it up as a keyword */
353      result_str = result.toString();
354      keyword_num = (Integer)keywords.get(result_str);
355
356      /* if we found something, return that keyword */
357      if (keyword_num != null)
358    return new token(keyword_num.intValue());
359
360      /* otherwise build and return an id token with an attached string */
361      return new str_token(sym.ID, result_str);
362    }
363
364  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
365
366  /** Return one token.  This is the main external interface to the scanner.
367   *  It consumes sufficient characters to determine the next input token
368   *  and returns it.  To help with debugging, this routine actually calls
369   *  real_next_token() which does the work.  If you need to debug the
370   *  parser, this can be changed to call debug_next_token() which prints
371   *  a debugging message before returning the token.
372   */
373  public static token next_token() throws java.io.IOException
374    {
375      return real_next_token();
376    }
377
378  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
379
380  /** Debugging version of next_token().  This routine calls the real scanning
381   *  routine, prints a message on System.out indicating what the token is,
382   *  then returns it.
383   */
384  public static token debug_next_token() throws java.io.IOException
385    {
386      token result = real_next_token();
387      System.out.println("# next_token() => " + result.sym);
388      return result;
389    }
390
391  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
392
393  /** The actual routine to return one token.  This is normally called from
394   *  next_token(), but for debugging purposes can be called indirectly from
395   *  debug_next_token().
396   */
397  protected static token real_next_token() throws java.io.IOException
398    {
399      int sym_num;
400
401      for (;;)
402    {
403      /* look for white space */
404      if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
405          next_char == '\f' ||  next_char == '\r')
406        {
407          /* advance past it and try the next character */
408          advance();
409          continue;
410        }
411
412      /* look for a single character symbol */
413      sym_num = find_single_char(next_char);
414      if (sym_num != -1)
415        {
416          /* found one -- advance past it and return a token for it */
417          advance();
418          return new token(sym_num);
419        }
420
421      /* look for : or ::= */
422      if (next_char == ':')
423        {
424          /* if we don't have a second ':' return COLON */
425          if (next_char2 != ':')
426        {
427          advance();
428          return new token(sym.COLON);
429        }
430
431          /* move forward and look for the '=' */
432          advance();
433          if (next_char2 == '=')
434        {
435          advance(); advance();
436          return new token(sym.COLON_COLON_EQUALS);
437        }
438          else
439        {
440          /* return just the colon (already consumed) */
441          return new token(sym.COLON);
442        }
443        }
444
445      /* look for a comment */
446      if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
447        {
448          /* swallow then continue the scan */
449          swallow_comment();
450          continue;
451        }
452
453      /* look for start of code string */
454      if (next_char == '{' && next_char2 == ':')
455        return do_code_string();
456
457      /* look for an id or keyword */
458      if (id_start_char(next_char)) return do_id();
459
460      /* look for EOF */
461      if (next_char == EOF_CHAR) return new token(sym.EOF);
462
463      /* if we get here, we have an unrecognized character */
464      emit_warn("Unrecognized character '" +
465        new Character((char)next_char) + "'(" + next_char +
466        ") -- ignored");
467
468      /* advance past it */
469      advance();
470    }
471    }
472
473  /*-----------------------------------------------------------*/
474};
475
476