StreamTokenizer.java revision dd828f42a5c83b4270d4fbf6fce2da1878f1e84a
1/* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18package java.io; 19 20/** 21 * Parses a stream into a set of defined tokens, one at a time. The different 22 * types of tokens that can be found are numbers, identifiers, quoted strings, 23 * and different comment styles. The class can be used for limited processing 24 * of source code of programming languages like Java, although it is nowhere 25 * near a full parser. 26 * 27 * @since Android 1.0 28 */ 29public class StreamTokenizer { 30 /** 31 * Contains a number if the current token is a number ({@code ttype} == 32 * {@code TT_NUMBER}). 33 * 34 * @since Android 1.0 35 */ 36 public double nval; 37 38 /** 39 * Contains a string if the current token is a word ({@code ttype} == 40 * {@code TT_WORD}). 41 * 42 * @since Android 1.0 43 */ 44 public String sval; 45 46 /** 47 * The constant representing the end of the stream. 48 * 49 * @since Android 1.0 50 */ 51 public static final int TT_EOF = -1; 52 53 /** 54 * The constant representing the end of the line. 55 * 56 * @since Android 1.0 57 */ 58 public static final int TT_EOL = '\n'; 59 60 /** 61 * The constant representing a number token. 62 * 63 * @since Android 1.0 64 */ 65 public static final int TT_NUMBER = -2; 66 67 /** 68 * The constant representing a word token. 69 * 70 * @since Android 1.0 71 */ 72 public static final int TT_WORD = -3; 73 74 /** 75 * Internal representation of unknown state. 76 */ 77 private static final int TT_UNKNOWN = -4; 78 79 /** 80 * After calling {@code nextToken()}, {@code ttype} contains the type of 81 * token that has been read. When a single character is read, its value 82 * converted to an integer is stored in {@code ttype}. For a quoted string, 83 * the value is the quoted character. Otherwise, its value is one of the 84 * following: 85 * <ul> 86 * <li> {@code TT_WORD} - the token is a word.</li> 87 * <li> {@code TT_NUMBER} - the token is a number.</li> 88 * <li> {@code TT_EOL} - the end of line has been reached. Depends on 89 * whether {@code eolIsSignificant} is {@code true}.</li> 90 * <li> {@code TT_EOF} - the end of the stream has been reached.</li> 91 * </ul> 92 * 93 * @since Android 1.0 94 */ 95 public int ttype = TT_UNKNOWN; 96 97 /** 98 * Internal character meanings, 0 implies TOKEN_ORDINARY 99 */ 100 private byte tokenTypes[] = new byte[256]; 101 102 private static final byte TOKEN_COMMENT = 1; 103 104 private static final byte TOKEN_QUOTE = 2; 105 106 private static final byte TOKEN_WHITE = 4; 107 108 private static final byte TOKEN_WORD = 8; 109 110 private static final byte TOKEN_DIGIT = 16; 111 112 private int lineNumber = 1; 113 114 private boolean forceLowercase; 115 116 private boolean isEOLSignificant; 117 118 private boolean slashStarComments; 119 120 private boolean slashSlashComments; 121 122 private boolean pushBackToken; 123 124 private boolean lastCr; 125 126 /* One of these will have the stream */ 127 private InputStream inStream; 128 129 private Reader inReader; 130 131 private int peekChar = -2; 132 133 /** 134 * Private constructor to initialize the default values according to the 135 * specification. 136 */ 137 private StreamTokenizer() { 138 /** 139 * Initialize the default state per specification. All byte values 'A' 140 * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are 141 * considered to be alphabetic. 142 */ 143 wordChars('A', 'Z'); 144 wordChars('a', 'z'); 145 wordChars(160, 255); 146 /** 147 * All byte values '\u0000' through '\u0020' are considered to be white 148 * space. 149 */ 150 whitespaceChars(0, 32); 151 /** 152 * '/' is a comment character. Single quote '\'' and double quote '"' 153 * are string quote characters. 154 */ 155 commentChar('/'); 156 quoteChar('"'); 157 quoteChar('\''); 158 /** 159 * Numbers are parsed. 160 */ 161 parseNumbers(); 162 /** 163 * Ends of lines are treated as white space, not as separate tokens. 164 * C-style and C++-style comments are not recognized. These are the 165 * defaults and are not needed in constructor. 166 */ 167 } 168 169 /** 170 * Constructs a new {@code StreamTokenizer} with {@code is} as source input 171 * stream. This constructor is deprecated; instead, the constructor that 172 * takes a {@code Reader} as an arugment should be used. 173 * 174 * @param is 175 * the source stream from which to parse tokens. 176 * @throws NullPointerException 177 * if {@code is} is {@code null}. 178 * @deprecated Use {@link #StreamTokenizer(Reader)} 179 * @since Android 1.0 180 */ 181 @Deprecated 182 public StreamTokenizer(InputStream is) { 183 this(); 184 if (is == null) { 185 throw new NullPointerException(); 186 } 187 inStream = is; 188 } 189 190 /** 191 * Constructs a new {@code StreamTokenizer} with {@code r} as source reader. 192 * The tokenizer's initial state is as follows: 193 * <ul> 194 * <li>All byte values 'A' through 'Z', 'a' through 'z', and '\u00A0' 195 * through '\u00FF' are considered to be alphabetic.</li> 196 * <li>All byte values '\u0000' through '\u0020' are considered to 197 * be white space. '/' is a comment character.</li> 198 * <li>Single quote '\'' and double quote '"' are string quote characters. 199 * </li> 200 * <li>Numbers are parsed.</li> 201 * <li>End of lines are considered to be white space rather than separate 202 * tokens.</li> 203 * <li>C-style and C++-style comments are not recognized.</LI> 204 * </ul> 205 * 206 * @param r 207 * the source reader from which to parse tokens. 208 * @since Android 1.0 209 */ 210 public StreamTokenizer(Reader r) { 211 this(); 212 if (r == null) { 213 throw new NullPointerException(); 214 } 215 inReader = r; 216 } 217 218 /** 219 * Specifies that the character {@code ch} shall be treated as a comment 220 * character. 221 * 222 * @param ch 223 * the character to be considered a comment character. 224 * @since Android 1.0 225 */ 226 public void commentChar(int ch) { 227 if (0 <= ch && ch < tokenTypes.length) { 228 tokenTypes[ch] = TOKEN_COMMENT; 229 } 230 } 231 232 /** 233 * Specifies whether the end of a line is significant and should be returned 234 * as {@code TT_EOF} in {@code ttype} by this tokenizer. 235 * 236 * @param flag 237 * {@code true} if EOL is significant, {@code false} otherwise. 238 * @since Android 1.0 239 */ 240 public void eolIsSignificant(boolean flag) { 241 isEOLSignificant = flag; 242 } 243 244 /** 245 * Returns the current line number. 246 * 247 * @return this tokenizer's current line number. 248 * @since Android 1.0 249 */ 250 public int lineno() { 251 return lineNumber; 252 } 253 254 /** 255 * Specifies whether word tokens should be converted to lower case when they 256 * are stored in {@code sval}. 257 * 258 * @param flag 259 * {@code true} if {@code sval} should be converted to lower 260 * case, {@code false} otherwise. 261 * @since Android 1.0 262 */ 263 public void lowerCaseMode(boolean flag) { 264 forceLowercase = flag; 265 } 266 267 /** 268 * Parses the next token from this tokenizer's source stream or reader. The 269 * type of the token is stored in the {@code ttype} field, additional 270 * information may be stored in the {@code nval} or {@code sval} fields. 271 * 272 * @return the value of {@code ttype}. 273 * @throws IOException 274 * if an I/O error occurs while parsing the next token. 275 * @since Android 1.0 276 */ 277 public int nextToken() throws IOException { 278 if (pushBackToken) { 279 pushBackToken = false; 280 if (ttype != TT_UNKNOWN) { 281 return ttype; 282 } 283 } 284 sval = null; // Always reset sval to null 285 int currentChar = peekChar == -2 ? read() : peekChar; 286 287 if (lastCr && currentChar == '\n') { 288 lastCr = false; 289 currentChar = read(); 290 } 291 if (currentChar == -1) { 292 return (ttype = TT_EOF); 293 } 294 295 byte currentType = currentChar > 255 ? TOKEN_WORD 296 : tokenTypes[currentChar]; 297 while ((currentType & TOKEN_WHITE) != 0) { 298 /** 299 * Skip over white space until we hit a new line or a real token 300 */ 301 if (currentChar == '\r') { 302 lineNumber++; 303 if (isEOLSignificant) { 304 lastCr = true; 305 peekChar = -2; 306 return (ttype = TT_EOL); 307 } 308 if ((currentChar = read()) == '\n') { 309 currentChar = read(); 310 } 311 } else if (currentChar == '\n') { 312 lineNumber++; 313 if (isEOLSignificant) { 314 peekChar = -2; 315 return (ttype = TT_EOL); 316 } 317 currentChar = read(); 318 } else { 319 // Advance over this white space character and try again. 320 currentChar = read(); 321 } 322 if (currentChar == -1) { 323 return (ttype = TT_EOF); 324 } 325 currentType = currentChar > 255 ? TOKEN_WORD 326 : tokenTypes[currentChar]; 327 } 328 329 /** 330 * Check for digits before checking for words since digits can be 331 * contained within words. 332 */ 333 if ((currentType & TOKEN_DIGIT) != 0) { 334 StringBuilder digits = new StringBuilder(20); 335 boolean haveDecimal = false, checkJustNegative = currentChar == '-'; 336 while (true) { 337 if (currentChar == '.') { 338 haveDecimal = true; 339 } 340 digits.append((char) currentChar); 341 currentChar = read(); 342 if ((currentChar < '0' || currentChar > '9') 343 && (haveDecimal || currentChar != '.')) { 344 break; 345 } 346 } 347 peekChar = currentChar; 348 if (checkJustNegative && digits.length() == 1) { 349 // Didn't get any other digits other than '-' 350 return (ttype = '-'); 351 } 352 try { 353 nval = Double.valueOf(digits.toString()).doubleValue(); 354 } catch (NumberFormatException e) { 355 // Unsure what to do, will write test. 356 nval = 0; 357 } 358 return (ttype = TT_NUMBER); 359 } 360 // Check for words 361 if ((currentType & TOKEN_WORD) != 0) { 362 StringBuffer word = new StringBuffer(20); 363 while (true) { 364 word.append((char) currentChar); 365 currentChar = read(); 366 if (currentChar == -1 367 || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) { 368 break; 369 } 370 } 371 peekChar = currentChar; 372 sval = forceLowercase ? word.toString().toLowerCase() : word 373 .toString(); 374 return (ttype = TT_WORD); 375 } 376 // Check for quoted character 377 if (currentType == TOKEN_QUOTE) { 378 int matchQuote = currentChar; 379 StringBuffer quoteString = new StringBuffer(); 380 int peekOne = read(); 381 while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r' 382 && peekOne != '\n') { 383 boolean readPeek = true; 384 if (peekOne == '\\') { 385 int c1 = read(); 386 // Check for quoted octal IE: \377 387 if (c1 <= '7' && c1 >= '0') { 388 int digitValue = c1 - '0'; 389 c1 = read(); 390 if (c1 > '7' || c1 < '0') { 391 readPeek = false; 392 } else { 393 digitValue = digitValue * 8 + (c1 - '0'); 394 c1 = read(); 395 // limit the digit value to a byte 396 if (digitValue > 037 || c1 > '7' || c1 < '0') { 397 readPeek = false; 398 } else { 399 digitValue = digitValue * 8 + (c1 - '0'); 400 } 401 } 402 if (!readPeek) { 403 // We've consumed one to many 404 quoteString.append((char) digitValue); 405 peekOne = c1; 406 } else { 407 peekOne = digitValue; 408 } 409 } else { 410 switch (c1) { 411 case 'a': 412 peekOne = 0x7; 413 break; 414 case 'b': 415 peekOne = 0x8; 416 break; 417 case 'f': 418 peekOne = 0xc; 419 break; 420 case 'n': 421 peekOne = 0xA; 422 break; 423 case 'r': 424 peekOne = 0xD; 425 break; 426 case 't': 427 peekOne = 0x9; 428 break; 429 case 'v': 430 peekOne = 0xB; 431 break; 432 default: 433 peekOne = c1; 434 } 435 } 436 } 437 if (readPeek) { 438 quoteString.append((char) peekOne); 439 peekOne = read(); 440 } 441 } 442 if (peekOne == matchQuote) { 443 peekOne = read(); 444 } 445 peekChar = peekOne; 446 ttype = matchQuote; 447 sval = quoteString.toString(); 448 return ttype; 449 } 450 // Do comments, both "//" and "/*stuff*/" 451 if (currentChar == '/' && (slashSlashComments || slashStarComments)) { 452 if ((currentChar = read()) == '*' && slashStarComments) { 453 int peekOne = read(); 454 while (true) { 455 currentChar = peekOne; 456 peekOne = read(); 457 if (currentChar == -1) { 458 peekChar = -1; 459 return (ttype = TT_EOF); 460 } 461 if (currentChar == '\r') { 462 if (peekOne == '\n') { 463 peekOne = read(); 464 } 465 lineNumber++; 466 } else if (currentChar == '\n') { 467 lineNumber++; 468 } else if (currentChar == '*' && peekOne == '/') { 469 peekChar = read(); 470 return nextToken(); 471 } 472 } 473 } else if (currentChar == '/' && slashSlashComments) { 474 // Skip to EOF or new line then return the next token 475 while ((currentChar = read()) >= 0 && currentChar != '\r' 476 && currentChar != '\n') { 477 // Intentionally empty 478 } 479 peekChar = currentChar; 480 return nextToken(); 481 } else if (currentType != TOKEN_COMMENT) { 482 // Was just a slash by itself 483 peekChar = currentChar; 484 return (ttype = '/'); 485 } 486 } 487 // Check for comment character 488 if (currentType == TOKEN_COMMENT) { 489 // Skip to EOF or new line then return the next token 490 while ((currentChar = read()) >= 0 && currentChar != '\r' 491 && currentChar != '\n') { 492 // Intentionally empty 493 } 494 peekChar = currentChar; 495 return nextToken(); 496 } 497 498 peekChar = read(); 499 return (ttype = currentChar); 500 } 501 502 /** 503 * Specifies that the character {@code ch} shall be treated as an ordinary 504 * character by this tokenizer. That is, it has no special meaning as a 505 * comment character, word component, white space, string delimiter or 506 * number. 507 * 508 * @param ch 509 * the character to be considered an ordinary character. 510 * @since Android 1.0 511 */ 512 public void ordinaryChar(int ch) { 513 if (0 <= ch && ch < tokenTypes.length) { 514 tokenTypes[ch] = 0; 515 } 516 } 517 518 /** 519 * Specifies that the characters in the range from {@code low} to {@code hi} 520 * shall be treated as an ordinary character by this tokenizer. That is, 521 * they have no special meaning as a comment character, word component, 522 * white space, string delimiter or number. 523 * 524 * @param low 525 * the first character in the range of ordinary characters. 526 * @param hi 527 * the last character in the range of ordinary characters. 528 * @since Android 1.0 529 */ 530 public void ordinaryChars(int low, int hi) { 531 if (low < 0) { 532 low = 0; 533 } 534 if (hi > tokenTypes.length) { 535 hi = tokenTypes.length - 1; 536 } 537 for (int i = low; i <= hi; i++) { 538 tokenTypes[i] = 0; 539 } 540 } 541 542 /** 543 * Specifies that this tokenizer shall parse numbers. 544 * 545 * @since Android 1.0 546 */ 547 public void parseNumbers() { 548 for (int i = '0'; i <= '9'; i++) { 549 tokenTypes[i] |= TOKEN_DIGIT; 550 } 551 tokenTypes['.'] |= TOKEN_DIGIT; 552 tokenTypes['-'] |= TOKEN_DIGIT; 553 } 554 555 /** 556 * Indicates that the current token should be pushed back and returned again 557 * the next time {@code nextToken()} is called. 558 * 559 * @since Android 1.0 560 */ 561 public void pushBack() { 562 pushBackToken = true; 563 } 564 565 /** 566 * Specifies that the character {@code ch} shall be treated as a quote 567 * character. 568 * 569 * @param ch 570 * the character to be considered a quote character. 571 * @since Android 1.0 572 */ 573 public void quoteChar(int ch) { 574 if (0 <= ch && ch < tokenTypes.length) { 575 tokenTypes[ch] = TOKEN_QUOTE; 576 } 577 } 578 579 private int read() throws IOException { 580 // Call the read for the appropriate stream 581 if (inStream == null) { 582 return inReader.read(); 583 } 584 return inStream.read(); 585 } 586 587 /** 588 * Specifies that all characters shall be treated as ordinary characters. 589 * 590 * @since Android 1.0 591 */ 592 public void resetSyntax() { 593 for (int i = 0; i < 256; i++) { 594 tokenTypes[i] = 0; 595 } 596 } 597 598 /** 599 * Specifies whether "slash-slash" (C++-style) comments shall be recognized. 600 * This kind of comment ends at the end of the line. 601 * 602 * @param flag 603 * {@code true} if {@code //} should be recognized as the start 604 * of a comment, {@code false} otherwise. 605 * @since Android 1.0 606 */ 607 public void slashSlashComments(boolean flag) { 608 slashSlashComments = flag; 609 } 610 611 /** 612 * Specifies whether "slash-star" (C-style) comments shall be recognized. 613 * Slash-star comments cannot be nested and end when a star-slash 614 * combination is found. 615 * 616 * @param flag 617 * {@code true} if {@code /*} should be recognized as the start 618 * of a comment, {@code false} otherwise. 619 * @since Android 1.0 620 */ 621 public void slashStarComments(boolean flag) { 622 slashStarComments = flag; 623 } 624 625 /** 626 * Returns the state of this tokenizer in a readable format. 627 * 628 * @return the current state of this tokenizer. 629 * @since Android 1.0 630 */ 631 @Override 632 public String toString() { 633 // Values determined through experimentation 634 StringBuilder result = new StringBuilder(); 635 result.append("Token["); //$NON-NLS-1$ 636 switch (ttype) { 637 case TT_EOF: 638 result.append("EOF"); //$NON-NLS-1$ 639 break; 640 case TT_EOL: 641 result.append("EOL"); //$NON-NLS-1$ 642 break; 643 case TT_NUMBER: 644 result.append("n="); //$NON-NLS-1$ 645 result.append(nval); 646 break; 647 case TT_WORD: 648 result.append(sval); 649 break; 650 default: 651 // BEGIN android-changed 652 // copied from a newer version of harmony 653 if (ttype == TT_UNKNOWN || tokenTypes[ttype] == TOKEN_QUOTE) { 654 result.append(sval); 655 } else { 656 result.append('\''); 657 result.append((char) ttype); 658 result.append('\''); 659 } 660 // END android-changed 661 } 662 result.append("], line "); //$NON-NLS-1$ 663 result.append(lineNumber); 664 return result.toString(); 665 } 666 667 /** 668 * Specifies that the characters in the range from {@code low} to {@code hi} 669 * shall be treated as whitespace characters by this tokenizer. 670 * 671 * @param low 672 * the first character in the range of whitespace characters. 673 * @param hi 674 * the last character in the range of whitespace characters. 675 * @since Android 1.0 676 */ 677 public void whitespaceChars(int low, int hi) { 678 if (low < 0) { 679 low = 0; 680 } 681 if (hi > tokenTypes.length) { 682 hi = tokenTypes.length - 1; 683 } 684 for (int i = low; i <= hi; i++) { 685 tokenTypes[i] = TOKEN_WHITE; 686 } 687 } 688 689 /** 690 * Specifies that the characters in the range from {@code low} to {@code hi} 691 * shall be treated as word characters by this tokenizer. A word consists of 692 * a word character followed by zero or more word or number characters. 693 * 694 * @param low 695 * the first character in the range of word characters. 696 * @param hi 697 * the last character in the range of word characters. 698 * @since Android 1.0 699 */ 700 public void wordChars(int low, int hi) { 701 if (low < 0) { 702 low = 0; 703 } 704 if (hi > tokenTypes.length) { 705 hi = tokenTypes.length - 1; 706 } 707 for (int i = low; i <= hi; i++) { 708 tokenTypes[i] |= TOKEN_WORD; 709 } 710 } 711} 712