StreamTokenizer.java revision dd828f42a5c83b4270d4fbf6fce2da1878f1e84a
/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
17
18package java.io;
19
/**
 * Parses a stream into a set of defined tokens, one at a time. The different
 * types of tokens that can be found are numbers, identifiers, quoted strings,
 * and different comment styles. The class can be used for limited processing
 * of source code of programming languages like Java, although it is nowhere
 * near a full parser.
 * <p>
 * Character classes are kept in a 256-entry table; characters above 255 are
 * always treated as word characters. Instances keep mutable parsing state and
 * perform no synchronization.
 *
 * @since Android 1.0
 */
public class StreamTokenizer {
    /**
     * Contains a number if the current token is a number ({@code ttype} ==
     * {@code TT_NUMBER}).
     *
     * @since Android 1.0
     */
    public double nval;

    /**
     * Contains a string if the current token is a word ({@code ttype} ==
     * {@code TT_WORD}) or the body of a quoted string ({@code ttype} is the
     * quote character). Reset to {@code null} at the start of every
     * non-pushed-back {@code nextToken()} call.
     *
     * @since Android 1.0
     */
    public String sval;

    /**
     * The constant representing the end of the stream.
     *
     * @since Android 1.0
     */
    public static final int TT_EOF = -1;

    /**
     * The constant representing the end of the line.
     *
     * @since Android 1.0
     */
    public static final int TT_EOL = '\n';

    /**
     * The constant representing a number token.
     *
     * @since Android 1.0
     */
    public static final int TT_NUMBER = -2;

    /**
     * The constant representing a word token.
     *
     * @since Android 1.0
     */
    public static final int TT_WORD = -3;

    /**
     * Internal representation of unknown state.
     */
    private static final int TT_UNKNOWN = -4;

    /**
     * After calling {@code nextToken()}, {@code ttype} contains the type of
     * token that has been read. When a single character is read, its value
     * converted to an integer is stored in {@code ttype}. For a quoted string,
     * the value is the quoted character. Otherwise, its value is one of the
     * following:
     * <ul>
     * <li> {@code TT_WORD} - the token is a word.</li>
     * <li> {@code TT_NUMBER} - the token is a number.</li>
     * <li> {@code TT_EOL} - the end of line has been reached. Depends on
     * whether {@code eolIsSignificant} is {@code true}.</li>
     * <li> {@code TT_EOF} - the end of the stream has been reached.</li>
     * </ul>
     *
     * @since Android 1.0
     */
    public int ttype = TT_UNKNOWN;

    /** Character class bit: starts a single-line comment. */
    private static final byte TOKEN_COMMENT = 1;

    /** Character class bit: delimits a quoted string. */
    private static final byte TOKEN_QUOTE = 2;

    /** Character class bit: white space. */
    private static final byte TOKEN_WHITE = 4;

    /** Character class bit: may be part of a word. */
    private static final byte TOKEN_WORD = 8;

    /** Character class bit: may be part of a number. */
    private static final byte TOKEN_DIGIT = 16;

    /** Value of {@code peekChar} meaning "no look-ahead character is buffered". */
    private static final int NO_PEEK = -2;

    /**
     * Internal character meanings indexed by character value; 0 means the
     * character is ordinary.
     */
    private final byte[] tokenTypes = new byte[256];

    private int lineNumber = 1;

    private boolean forceLowercase;

    private boolean isEOLSignificant;

    private boolean slashStarComments;

    private boolean slashSlashComments;

    private boolean pushBackToken;

    /** Set when a significant CR was just returned, so a directly following LF is dropped. */
    private boolean lastCr;

    /* One of these will have the stream */
    private InputStream inStream;

    private Reader inReader;

    /** Buffered look-ahead character, -1 for a seen end of stream, or {@code NO_PEEK}. */
    private int peekChar = NO_PEEK;

    /**
     * Private constructor to initialize the default values according to the
     * specification.
     */
    private StreamTokenizer() {
        // 'A'..'Z', 'a'..'z' and U+00A0..U+00FF are alphabetic.
        wordChars('A', 'Z');
        wordChars('a', 'z');
        wordChars(160, 255);
        // U+0000..U+0020 are white space.
        whitespaceChars(0, 32);
        // '/' starts a comment; single and double quote delimit strings.
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        // Numbers are parsed.
        parseNumbers();
        // Ends of lines are white space and C/C++-style comments are not
        // recognized; those are the field defaults, so nothing to do here.
    }

    /**
     * Constructs a new {@code StreamTokenizer} with {@code is} as source input
     * stream. This constructor is deprecated; instead, the constructor that
     * takes a {@code Reader} as an argument should be used.
     *
     * @param is
     *            the source stream from which to parse tokens.
     * @throws NullPointerException
     *             if {@code is} is {@code null}.
     * @deprecated Use {@link #StreamTokenizer(Reader)}
     * @since Android 1.0
     */
    @Deprecated
    public StreamTokenizer(InputStream is) {
        this();
        if (is == null) {
            throw new NullPointerException();
        }
        inStream = is;
    }

    /**
     * Constructs a new {@code StreamTokenizer} with {@code r} as source reader.
     * The tokenizer's initial state is as follows:
     * <ul>
     * <li>All byte values 'A' through 'Z', 'a' through 'z', and '&#92;u00A0'
     * through '&#92;u00FF' are considered to be alphabetic.</li>
     * <li>All byte values '&#92;u0000' through '&#92;u0020' are considered to
     * be white space. '/' is a comment character.</li>
     * <li>Single quote '\'' and double quote '"' are string quote characters.
     * </li>
     * <li>Numbers are parsed.</li>
     * <li>End of lines are considered to be white space rather than separate
     * tokens.</li>
     * <li>C-style and C++-style comments are not recognized.</li>
     * </ul>
     *
     * @param r
     *            the source reader from which to parse tokens.
     * @throws NullPointerException
     *             if {@code r} is {@code null}.
     * @since Android 1.0
     */
    public StreamTokenizer(Reader r) {
        this();
        if (r == null) {
            throw new NullPointerException();
        }
        inReader = r;
    }

    /**
     * Specifies that the character {@code ch} shall be treated as a comment
     * character. Values outside 0..255 are ignored.
     *
     * @param ch
     *            the character to be considered a comment character.
     * @since Android 1.0
     */
    public void commentChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_COMMENT;
        }
    }

    /**
     * Specifies whether the end of a line is significant and should be returned
     * as {@code TT_EOL} in {@code ttype} by this tokenizer.
     *
     * @param flag
     *            {@code true} if EOL is significant, {@code false} otherwise.
     * @since Android 1.0
     */
    public void eolIsSignificant(boolean flag) {
        isEOLSignificant = flag;
    }

    /**
     * Returns the current line number.
     *
     * @return this tokenizer's current line number.
     * @since Android 1.0
     */
    public int lineno() {
        return lineNumber;
    }

    /**
     * Specifies whether word tokens should be converted to lower case when they
     * are stored in {@code sval}.
     *
     * @param flag
     *            {@code true} if {@code sval} should be converted to lower
     *            case, {@code false} otherwise.
     * @since Android 1.0
     */
    public void lowerCaseMode(boolean flag) {
        forceLowercase = flag;
    }

    /**
     * Parses the next token from this tokenizer's source stream or reader. The
     * type of the token is stored in the {@code ttype} field, additional
     * information may be stored in the {@code nval} or {@code sval} fields.
     *
     * @return the value of {@code ttype}.
     * @throws IOException
     *             if an I/O error occurs while parsing the next token.
     * @since Android 1.0
     */
    public int nextToken() throws IOException {
        if (pushBackToken) {
            pushBackToken = false;
            if (ttype != TT_UNKNOWN) {
                return ttype;
            }
        }
        sval = null; // Always reset sval to null
        int currentChar = peekChar == NO_PEEK ? read() : peekChar;

        if (lastCr) {
            // Only an LF that directly follows the CR belongs to the same line
            // break. Clear the flag unconditionally so that a later, unrelated
            // LF is still reported.
            lastCr = false;
            if (currentChar == '\n') {
                currentChar = read();
            }
        }
        if (currentChar == -1) {
            return (ttype = TT_EOF);
        }

        byte currentType = typeOf(currentChar);
        while ((currentType & TOKEN_WHITE) != 0) {
            // Skip over white space until we hit a new line or a real token.
            if (currentChar == '\r') {
                lineNumber++;
                if (isEOLSignificant) {
                    lastCr = true;
                    peekChar = NO_PEEK;
                    return (ttype = TT_EOL);
                }
                currentChar = read();
                if (currentChar == '\n') {
                    currentChar = read();
                }
            } else if (currentChar == '\n') {
                lineNumber++;
                if (isEOLSignificant) {
                    peekChar = NO_PEEK;
                    return (ttype = TT_EOL);
                }
                currentChar = read();
            } else {
                // Advance over this white space character and try again.
                currentChar = read();
            }
            if (currentChar == -1) {
                // Remember the end of stream; otherwise a stale look-ahead
                // line terminator would be counted again on every later call.
                peekChar = -1;
                return (ttype = TT_EOF);
            }
            currentType = typeOf(currentChar);
        }

        // Check for digits before checking for words since digits can be
        // contained within words.
        if ((currentType & TOKEN_DIGIT) != 0) {
            return readNumber(currentChar);
        }
        if ((currentType & TOKEN_WORD) != 0) {
            return readWord(currentChar);
        }
        if (currentType == TOKEN_QUOTE) {
            return readQuotedString(currentChar);
        }
        // Do comments, both "//" and "/*stuff*/"
        if (currentChar == '/' && (slashSlashComments || slashStarComments)) {
            currentChar = read();
            if (currentChar == '*' && slashStarComments) {
                return skipBlockComment();
            }
            if (currentChar == '/' && slashSlashComments) {
                return skipLineComment(read());
            }
            if (currentType != TOKEN_COMMENT) {
                // Was just a slash by itself
                peekChar = currentChar;
                return (ttype = '/');
            }
            // '/' is itself a comment character. The character after it has
            // already been read and may be the line terminator, so the skip
            // has to start with it instead of discarding it.
            return skipLineComment(currentChar);
        }
        // Check for comment character
        if (currentType == TOKEN_COMMENT) {
            return skipLineComment(read());
        }

        // Ordinary character: it is its own token type.
        peekChar = read();
        return (ttype = currentChar);
    }

    /** Returns the class bits of {@code ch}; characters above 255 count as word characters. */
    private byte typeOf(int ch) {
        return ch > 255 ? TOKEN_WORD : tokenTypes[ch];
    }

    /**
     * Reads a number token whose first character is {@code currentChar} and
     * returns the resulting token type. A lone '-' is returned as the ordinary
     * character '-'.
     */
    private int readNumber(int currentChar) throws IOException {
        StringBuilder digits = new StringBuilder(20);
        boolean haveDecimal = false;
        boolean checkJustNegative = currentChar == '-';
        while (true) {
            if (currentChar == '.') {
                haveDecimal = true;
            }
            digits.append((char) currentChar);
            currentChar = read();
            // Stop at the first non-digit; a single '.' is accepted.
            if ((currentChar < '0' || currentChar > '9')
                    && (haveDecimal || currentChar != '.')) {
                break;
            }
        }
        peekChar = currentChar;
        if (checkJustNegative && digits.length() == 1) {
            // Didn't get any other digits other than '-'
            return (ttype = '-');
        }
        try {
            nval = Double.parseDouble(digits.toString());
        } catch (NumberFormatException ignored) {
            // Text such as "." or "-." carries no digits; report it as zero.
            nval = 0;
        }
        return (ttype = TT_NUMBER);
    }

    /**
     * Reads a word token whose first character is {@code currentChar}, stores
     * it in {@code sval} and returns {@code TT_WORD}.
     */
    private int readWord(int currentChar) throws IOException {
        StringBuilder word = new StringBuilder(20);
        while (true) {
            word.append((char) currentChar);
            currentChar = read();
            if (currentChar == -1
                    || (currentChar < 256
                            && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) {
                break;
            }
        }
        peekChar = currentChar;
        sval = forceLowercase ? word.toString().toLowerCase() : word.toString();
        return (ttype = TT_WORD);
    }

    /**
     * Reads a quoted string opened by {@code matchQuote}, decoding backslash
     * escapes, stores it in {@code sval} and returns the quote character as
     * the token type. The string ends at the matching quote, a line
     * terminator, or the end of the stream.
     */
    private int readQuotedString(int matchQuote) throws IOException {
        StringBuilder quoteString = new StringBuilder();
        int peekOne = read();
        while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r'
                && peekOne != '\n') {
            boolean readPeek = true;
            if (peekOne == '\\') {
                int c1 = read();
                // Check for quoted octal IE: \377
                if (c1 <= '7' && c1 >= '0') {
                    int digitValue = c1 - '0';
                    c1 = read();
                    if (c1 > '7' || c1 < '0') {
                        readPeek = false;
                    } else {
                        digitValue = digitValue * 8 + (c1 - '0');
                        c1 = read();
                        // limit the digit value to a byte
                        if (digitValue > 037 || c1 > '7' || c1 < '0') {
                            readPeek = false;
                        } else {
                            digitValue = digitValue * 8 + (c1 - '0');
                        }
                    }
                    if (!readPeek) {
                        // We've consumed one too many: emit the octal value and
                        // re-examine the extra character on the next pass.
                        quoteString.append((char) digitValue);
                        peekOne = c1;
                    } else {
                        peekOne = digitValue;
                    }
                } else {
                    peekOne = unescape(c1);
                }
            }
            if (readPeek) {
                quoteString.append((char) peekOne);
                peekOne = read();
            }
        }
        if (peekOne == matchQuote) {
            // Consume the closing quote.
            peekOne = read();
        }
        peekChar = peekOne;
        ttype = matchQuote;
        sval = quoteString.toString();
        return ttype;
    }

    /**
     * Maps the character following a backslash to the character it denotes;
     * characters without a special meaning are returned unchanged.
     */
    private static int unescape(int ch) {
        switch (ch) {
            case 'a':
                return 0x7;
            case 'b':
                return 0x8;
            case 'f':
                return 0xc;
            case 'n':
                return 0xA;
            case 'r':
                return 0xD;
            case 't':
                return 0x9;
            case 'v':
                return 0xB;
            default:
                return ch;
        }
    }

    /**
     * Skips a slash-star comment whose opening delimiter has already been
     * consumed, counting the lines it spans, and returns the token after it
     * (or {@code TT_EOF} if the comment is not terminated).
     */
    private int skipBlockComment() throws IOException {
        int current = read();
        while (current != -1) {
            int next = read();
            if (current == '\r') {
                if (next == '\n') {
                    next = read();
                }
                lineNumber++;
            } else if (current == '\n') {
                lineNumber++;
            } else if (current == '*' && next == '/') {
                peekChar = read();
                return nextToken();
            }
            current = next;
        }
        peekChar = -1;
        return (ttype = TT_EOF);
    }

    /**
     * Skips the rest of a single-line comment and returns the token after it.
     *
     * @param ch
     *            the first not yet examined character of the comment; the line
     *            terminator (or end of stream) that ends the comment is left
     *            as look-ahead so that it is tokenized normally.
     */
    private int skipLineComment(int ch) throws IOException {
        while (ch >= 0 && ch != '\r' && ch != '\n') {
            ch = read();
        }
        peekChar = ch;
        return nextToken();
    }

    /**
     * Specifies that the character {@code ch} shall be treated as an ordinary
     * character by this tokenizer. That is, it has no special meaning as a
     * comment character, word component, white space, string delimiter or
     * number. Values outside 0..255 are ignored.
     *
     * @param ch
     *            the character to be considered an ordinary character.
     * @since Android 1.0
     */
    public void ordinaryChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = 0;
        }
    }

    /**
     * Specifies that the characters in the range from {@code low} to {@code hi}
     * shall be treated as an ordinary character by this tokenizer. That is,
     * they have no special meaning as a comment character, word component,
     * white space, string delimiter or number. The range is clipped to 0..255.
     *
     * @param low
     *            the first character in the range of ordinary characters.
     * @param hi
     *            the last character in the range of ordinary characters.
     * @since Android 1.0
     */
    public void ordinaryChars(int low, int hi) {
        if (low < 0) {
            low = 0;
        }
        if (hi >= tokenTypes.length) {
            hi = tokenTypes.length - 1;
        }
        for (int i = low; i <= hi; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Specifies that this tokenizer shall parse numbers: the digits, '.' and
     * '-' additionally become number characters.
     *
     * @since Android 1.0
     */
    public void parseNumbers() {
        for (int i = '0'; i <= '9'; i++) {
            tokenTypes[i] |= TOKEN_DIGIT;
        }
        tokenTypes['.'] |= TOKEN_DIGIT;
        tokenTypes['-'] |= TOKEN_DIGIT;
    }

    /**
     * Indicates that the current token should be pushed back and returned again
     * the next time {@code nextToken()} is called.
     *
     * @since Android 1.0
     */
    public void pushBack() {
        pushBackToken = true;
    }

    /**
     * Specifies that the character {@code ch} shall be treated as a quote
     * character. Values outside 0..255 are ignored.
     *
     * @param ch
     *            the character to be considered a quote character.
     * @since Android 1.0
     */
    public void quoteChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_QUOTE;
        }
    }

    /** Reads one character from whichever source this tokenizer was built on. */
    private int read() throws IOException {
        if (inStream == null) {
            return inReader.read();
        }
        return inStream.read();
    }

    /**
     * Specifies that all characters shall be treated as ordinary characters.
     *
     * @since Android 1.0
     */
    public void resetSyntax() {
        for (int i = 0; i < tokenTypes.length; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Specifies whether "slash-slash" (C++-style) comments shall be recognized.
     * This kind of comment ends at the end of the line.
     *
     * @param flag
     *            {@code true} if {@code //} should be recognized as the start
     *            of a comment, {@code false} otherwise.
     * @since Android 1.0
     */
    public void slashSlashComments(boolean flag) {
        slashSlashComments = flag;
    }

    /**
     * Specifies whether "slash-star" (C-style) comments shall be recognized.
     * Slash-star comments cannot be nested and end when a star-slash
     * combination is found.
     *
     * @param flag
     *            {@code true} if {@code /*} should be recognized as the start
     *            of a comment, {@code false} otherwise.
     * @since Android 1.0
     */
    public void slashStarComments(boolean flag) {
        slashStarComments = flag;
    }

    /**
     * Returns the state of this tokenizer in a readable format.
     *
     * @return the current state of this tokenizer.
     * @since Android 1.0
     */
    @Override
    public String toString() {
        // Values determined through experimentation
        StringBuilder result = new StringBuilder();
        result.append("Token["); //$NON-NLS-1$
        switch (ttype) {
            case TT_EOF:
                result.append("EOF"); //$NON-NLS-1$
                break;
            case TT_EOL:
                result.append("EOL"); //$NON-NLS-1$
                break;
            case TT_NUMBER:
                result.append("n="); //$NON-NLS-1$
                result.append(nval);
                break;
            case TT_WORD:
                result.append(sval);
                break;
            default:
                // ttype is a public field, so it may hold a value that is not
                // a valid table index; only consult the table when it is.
                boolean quoted = 0 <= ttype && ttype < tokenTypes.length
                        && tokenTypes[ttype] == TOKEN_QUOTE;
                if (ttype == TT_UNKNOWN || quoted) {
                    result.append(sval);
                } else {
                    result.append('\'');
                    result.append((char) ttype);
                    result.append('\'');
                }
        }
        result.append("], line "); //$NON-NLS-1$
        result.append(lineNumber);
        return result.toString();
    }

    /**
     * Specifies that the characters in the range from {@code low} to {@code hi}
     * shall be treated as whitespace characters by this tokenizer. The range is
     * clipped to 0..255.
     *
     * @param low
     *            the first character in the range of whitespace characters.
     * @param hi
     *            the last character in the range of whitespace characters.
     * @since Android 1.0
     */
    public void whitespaceChars(int low, int hi) {
        if (low < 0) {
            low = 0;
        }
        if (hi >= tokenTypes.length) {
            hi = tokenTypes.length - 1;
        }
        for (int i = low; i <= hi; i++) {
            tokenTypes[i] = TOKEN_WHITE;
        }
    }

    /**
     * Specifies that the characters in the range from {@code low} to {@code hi}
     * shall be treated as word characters by this tokenizer. A word consists of
     * a word character followed by zero or more word or number characters. The
     * range is clipped to 0..255; the word attribute is added to whatever
     * attributes the characters already have.
     *
     * @param low
     *            the first character in the range of word characters.
     * @param hi
     *            the last character in the range of word characters.
     * @since Android 1.0
     */
    public void wordChars(int low, int hi) {
        if (low < 0) {
            low = 0;
        }
        if (hi >= tokenTypes.length) {
            hi = tokenTypes.length - 1;
        }
        for (int i = low; i <= hi; i++) {
            tokenTypes[i] |= TOKEN_WORD;
        }
    }
}
712