/**
 *******************************************************************************
 * Copyright (C) 1996-2016, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.util;

import java.util.Enumeration;
import java.util.NoSuchElementException;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * {@icuenhanced java.util.StringTokenizer}.{@icu _usage_}
 *
 * <p>The string tokenizer class allows an application to break a string
 * into tokens by performing code point comparison.
 * The <code>StringTokenizer</code> methods do not distinguish
 * among identifiers, numbers, and quoted strings, nor do they recognize
 * and skip comments.
 * <p>
 * The set of delimiters (the codepoints that separate tokens) may be
 * specified either at creation time or on a per-token basis.
 * <p>
 * An instance of <code>StringTokenizer</code> behaves in one of three ways,
 * depending on whether it was created with the <code>returnDelims</code>
 * and <code>coalesceDelims</code>
 * flags having the value <code>true</code> or <code>false</code>:
 * <ul>
 * <li>If returnDelims is <code>false</code>, delimiter code points serve to
 * separate tokens. A token is a maximal sequence of consecutive
 * code points that are not delimiters.
 * <li>If returnDelims is <code>true</code>, delimiter code points are
 * themselves considered to be tokens. In this case, if coalesceDelims is
 * <code>true</code>, such tokens will be the maximal sequence of consecutive
 * code points that <em>are</em> delimiters.  If coalesceDelims is false,
 * a token will be received for each delimiter code point.
 * </ul>
 * <p>A token is thus either one
 * delimiter code point, a maximal sequence of consecutive code points that
 * are delimiters, or a maximal sequence of consecutive code
 * points that are not delimiters.
 * <p>
 * A <code>StringTokenizer</code> object internally maintains a current
 * position within the string to be tokenized. Some operations advance this
 * current position past the code point processed.
 * <p>
 * A token is returned by taking a substring of the string that was used to
 * create the <code>StringTokenizer</code> object.
 * <p>
 * Example of the use of the default delimiter tokenizer.
 * <blockquote><pre>
 * StringTokenizer st = new StringTokenizer("this is a test");
 * while (st.hasMoreTokens()) {
 *     println(st.nextToken());
 * }
 * </pre></blockquote>
 * <p>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 * <p>
 * Example of the use of the tokenizer with user specified delimiter.
 * <blockquote><pre>
 *     StringTokenizer st = new StringTokenizer(
 *     "this is a test with supplementary characters &#92;ud800&#92;ud800&#92;udc00&#92;udc00",
 *         " &#92;ud800&#92;udc00");
 *     while (st.hasMoreTokens()) {
 *         println(st.nextToken());
 *     }
 * </pre></blockquote>
 * <p>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 *     with
 *     supplementary
 *     characters
 *     &#92;ud800
 *     &#92;udc00
 * </pre></blockquote>
 *
 * @author syn wee
 * @stable ICU 2.4
 */
public final class StringTokenizer implements Enumeration<Object>
{
    // public constructors ---------------------------------------------

    /**
     * {@icu} Constructs a string tokenizer for the specified string. All
     * characters in the delim argument are the delimiters for separating
     * tokens.
     * <p>If the returnDelims flag is false, the delimiter characters are
     * skipped and only serve as separators between tokens.
     * <p>If the returnDelims flag is true, then the delimiter characters
     * are also returned as tokens, one per delimiter.
     * @param str a string to be parsed.
     * @param delim the delimiters.
     * @param returndelims flag indicating whether to return the delimiters
     *        as tokens.
     * @exception NullPointerException if str is null
     * @stable ICU 2.4
     */
    public StringTokenizer(String str, UnicodeSet delim, boolean returndelims)
    {
        this(str, delim, returndelims, false);
    }

    /**
     * {@icu} Constructs a string tokenizer for the specified string. All
     * characters in the delim argument are the delimiters for separating
     * tokens.
     * <p>If the returnDelims flag is false, the delimiter characters are
     * skipped and only serve as separators between tokens.
     * <p>If the returnDelims flag is true, then the delimiter characters
     * are also returned as tokens.  If coalescedelims is true, one token
     * is returned for each run of delimiter characters, otherwise one
     * token is returned per delimiter.  Since surrogate pairs can be
     * delimiters, the returned token might be two chars in length.
     * @param str a string to be parsed.
     * @param delim the delimiters; <code>null</code> is treated as the
     *        empty set.
     * @param returndelims flag indicating whether to return the delimiters
     *        as tokens.
     * @param coalescedelims flag indicating whether to return a run of
     *        delimiters as a single token or as one token per delimiter.
     *        This only takes effect if returndelims is true.
     * @exception NullPointerException if str is null
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
    {
        m_source_ = str;
        m_length_ = str.length();
        if (delim == null) {
            m_delimiters_ = EMPTY_DELIMITER_;
        }
        else {
            m_delimiters_ = delim;
        }
        m_returnDelimiters_ = returndelims;
        m_coalesceDelimiters_ = coalescedelims;
        m_tokenOffset_ = -1;
        m_tokenSize_ = -1;
        if (m_length_ == 0) {
            // string length 0, no tokens
            m_nextOffset_ = -1;
        }
        else {
            m_nextOffset_ = 0;
            if (!returndelims) {
                // leading delimiters are never part of a token
                m_nextOffset_ = getNextNonDelimiter(0);
            }
        }
    }

    /**
     * {@icu} Constructs a string tokenizer for the specified string. The
     * characters in the delim argument are the delimiters for separating
     * tokens.
     * <p>Delimiter characters themselves will not be treated as tokens.
     * @param str a string to be parsed.
     * @param delim the delimiters.
     * @exception NullPointerException if str is null
     * @stable ICU 2.4
     */
    public StringTokenizer(String str, UnicodeSet delim)
    {
        this(str, delim, false, false);
    }

    /**
     * <p>Constructs a string tokenizer for the specified string. All
     * characters in the delim argument are the delimiters for separating
     * tokens.
     * <p>If the returnDelims flag is false, the delimiter characters are
     * skipped and only serve as separators between tokens.
     * <p>If the returnDelims flag is true, then the delimiter characters
     * are also returned as tokens, one per delimiter.
     * @param str a string to be parsed.
     * @param delim the delimiters.
     * @param returndelims flag indicating whether to return the delimiters
     *        as tokens.
     * @exception NullPointerException if str is null
     * @stable ICU 2.4
     */
    public StringTokenizer(String str, String delim, boolean returndelims)
    {
        this(str, delim, returndelims, false); // java default behavior
    }

    /**
     * <p>Constructs a string tokenizer for the specified string. All
     * characters in the delim argument are the delimiters for separating
     * tokens.
     * <p>If the returnDelims flag is false, the delimiter characters are
     * skipped and only serve as separators between tokens.
     * <p>If the returnDelims flag is true, then the delimiter characters
     * are also returned as tokens.  If coalescedelims is true, one token
     * is returned for each run of delimiter characters, otherwise one
     * token is returned per delimiter.  Since surrogate pairs can be
     * delimiters, the returned token might be two chars in length.
     * @param str a string to be parsed.
     * @param delim the delimiters; <code>null</code> or an empty string
     *        means no delimiters.
     * @param returndelims flag indicating whether to return the delimiters
     *        as tokens.
     * @param coalescedelims flag indicating whether to return a run of
     *        delimiters as a single token or as one token per delimiter.
     *        This only takes effect if returndelims is true.
     * @exception NullPointerException if str is null
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public StringTokenizer(String str, String delim, boolean returndelims, boolean coalescedelims)
    {
        // don't ignore whitespace
        m_delimiters_ = EMPTY_DELIMITER_;
        if (delim != null && delim.length() > 0) {
            m_delimiters_ = new UnicodeSet();
            m_delimiters_.addAll(delim);
            checkDelimiters();
        }
        m_coalesceDelimiters_ = coalescedelims;
        m_source_ = str;
        m_length_ = str.length();
        m_returnDelimiters_ = returndelims;
        m_tokenOffset_ = -1;
        m_tokenSize_ = -1;
        if (m_length_ == 0) {
            // string length 0, no tokens
            m_nextOffset_ = -1;
        }
        else {
            m_nextOffset_ = 0;
            if (!returndelims) {
                // leading delimiters are never part of a token
                m_nextOffset_ = getNextNonDelimiter(0);
            }
        }
    }

    /**
     * <p>Constructs a string tokenizer for the specified string. The
     * characters in the delim argument are the delimiters for separating
     * tokens.
     * <p>Delimiter characters themselves will not be treated as tokens.
     * @param str a string to be parsed.
     * @param delim the delimiters.
     * @exception NullPointerException if str is null
     * @stable ICU 2.4
     */
    public StringTokenizer(String str, String delim)
    {
        // don't ignore whitespace
        this(str, delim, false, false);
    }

    /**
     * <p>Constructs a string tokenizer for the specified string.
     * The tokenizer uses the default delimiter set, which is
     * <code>" \t\n\r\f"</code>:
     * the space character, the tab character, the newline character, the
     * carriage-return character, and the form-feed character.
     * <p>Delimiter characters themselves will not be treated as tokens.
     * @param str a string to be parsed
     * @exception NullPointerException if str is null
     * @stable ICU 2.4
     */
    public StringTokenizer(String str)
    {
        this(str, DEFAULT_DELIMITERS_, false, false);
    }

    // public methods --------------------------------------------------

    /**
     * Tests if there are more tokens available from this tokenizer's
     * string.
     * If this method returns <code>true</code>, then a subsequent call to
     * <code>nextToken</code> with no argument will successfully return a token.
     * @return <code>true</code> if and only if there is at least one token
     *        in the string after the current position; <code>false</code>
     *        otherwise.
     * @stable ICU 2.4
     */
    public boolean hasMoreTokens()
    {
        return m_nextOffset_ >= 0;
    }

    /**
     * Returns the next token from this string tokenizer.
     * @return the next token from this string tokenizer.
     * @exception NoSuchElementException if there are no more tokens in
     *            this tokenizer's string.
     * @stable ICU 2.4
     */
    public String nextToken()
    {
        if (m_tokenOffset_ < 0) {
            if (m_nextOffset_ < 0) {
                throw new NoSuchElementException("No more tokens in String");
            }
            // pre-calculations of tokens not done
            if (m_returnDelimiters_) {
                int tokenlimit = 0;
                int c = UTF16.charAt(m_source_, m_nextOffset_);
                if (isDelimiter(c)) {
                    if (m_coalesceDelimiters_) {
                        tokenlimit = getNextNonDelimiter(m_nextOffset_);
                    } else {
                        // one delimiter code point, which may be a surrogate pair
                        tokenlimit = m_nextOffset_ + UTF16.getCharCount(c);
                        if (tokenlimit == m_length_) {
                            tokenlimit = -1;
                        }
                    }
                }
                else {
                    tokenlimit = getNextDelimiter(m_nextOffset_);
                }
                String result;
                if (tokenlimit < 0) {
                    // token runs to the end of the source string
                    result = m_source_.substring(m_nextOffset_);
                }
                else {
                    result = m_source_.substring(m_nextOffset_, tokenlimit);
                }
                m_nextOffset_ = tokenlimit;
                return result;
            }
            else {
                int tokenlimit = getNextDelimiter(m_nextOffset_);
                String result;
                if (tokenlimit < 0) {
                    // token runs to the end of the source string
                    result = m_source_.substring(m_nextOffset_);
                    m_nextOffset_ = tokenlimit;
                }
                else {
                    result = m_source_.substring(m_nextOffset_, tokenlimit);
                    m_nextOffset_ = getNextNonDelimiter(tokenlimit);
                }

                return result;
            }
        }
        // count was called before and we have all the tokens
        if (m_tokenOffset_ >= m_tokenSize_) {
            throw new NoSuchElementException("No more tokens in String");
        }
        String result;
        if (m_tokenLimit_[m_tokenOffset_] >= 0) {
            result = m_source_.substring(m_tokenStart_[m_tokenOffset_],
                                         m_tokenLimit_[m_tokenOffset_]);
        }
        else {
            result = m_source_.substring(m_tokenStart_[m_tokenOffset_]);
        }
        m_tokenOffset_ ++;
        m_nextOffset_ = -1;
        if (m_tokenOffset_ < m_tokenSize_) {
            m_nextOffset_ = m_tokenStart_[m_tokenOffset_];
        }
        return result;
    }

    /**
     * Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * <code>StringTokenizer</code> object is changed to be the characters in
     * the string <code>delim</code>. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token.  The new delimiter set
     * remains the default after this call.
     * @param delim the new delimiters.
     * @return the next token, after switching to the new delimiter set.
     * @exception NoSuchElementException if there are no more tokens in
     *            this tokenizer's string.
     * @stable ICU 2.4
     */
    public String nextToken(String delim)
    {
        m_delimiters_ = EMPTY_DELIMITER_;
        if (delim != null && delim.length() > 0) {
            m_delimiters_ = new UnicodeSet();
            m_delimiters_.addAll(delim);
        }
        return nextToken(m_delimiters_);
    }

    /**
     * {@icu} Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * <code>StringTokenizer</code> object is changed to be the characters in
     * the set <code>delim</code>. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token.  The new delimiter set
     * remains the default after this call.
     * @param delim the new delimiters.
     * @return the next token, after switching to the new delimiter set.
     * @exception NoSuchElementException if there are no more tokens in
     *            this tokenizer's string.
     * @stable ICU 2.4
     */
    public String nextToken(UnicodeSet delim)
    {
        m_delimiters_ = delim;
        checkDelimiters();
        // token boundaries cached by countTokens() used the old delimiters
        m_tokenOffset_ = -1;
        m_tokenSize_ = -1;
        if (!m_returnDelimiters_) {
            m_nextOffset_ = getNextNonDelimiter(m_nextOffset_);
        }
        return nextToken();
    }

    /**
     * Returns the same value as the <code>hasMoreTokens</code> method.
     * It exists so that this class can implement the
     * <code>Enumeration</code> interface.
     * @return <code>true</code> if there are more tokens;
     *         <code>false</code> otherwise.
     * @see #hasMoreTokens()
     * @stable ICU 2.4
     */
    public boolean hasMoreElements()
    {
        return hasMoreTokens();
    }

    /**
     * Returns the same value as the <code>nextToken</code> method, except
     * that its declared return value is <code>Object</code> rather than
     * <code>String</code>. It exists so that this class can implement the
     * <code>Enumeration</code> interface.
     * @return the next token in the string.
     * @exception NoSuchElementException if there are no more tokens in
     *            this tokenizer's string.
     * @see #nextToken()
     * @stable ICU 2.4
     */
    public Object nextElement()
    {
        return nextToken();
    }

    /**
     * Calculates the number of times that this tokenizer's
     * <code>nextToken</code> method can be called before it generates an
     * exception. The current position is not advanced.
     * <p>The boundaries of all remaining tokens are computed and cached, so
     * that later calls to <code>nextToken()</code> read them from the cache.
     * @return the number of tokens remaining in the string using the
     *         current delimiter set.
     * @see #nextToken()
     * @stable ICU 2.4
     */
    public int countTokens()
    {
        int result = 0;
        if (hasMoreTokens()) {
            if (m_tokenOffset_ >= 0) {
                // boundaries already cached by an earlier call
                return m_tokenSize_ - m_tokenOffset_;
            }
            if (m_tokenStart_ == null) {
                m_tokenStart_ = new int[TOKEN_SIZE_];
                m_tokenLimit_ = new int[TOKEN_SIZE_];
            }
            do {
                if (m_tokenStart_.length == result) {
                    // both arrays are full, grow them by TOKEN_SIZE_ entries
                    int temptokenindex[] = m_tokenStart_;
                    int temptokensize[] = m_tokenLimit_;
                    int originalsize = temptokenindex.length;
                    int newsize = originalsize + TOKEN_SIZE_;
                    m_tokenStart_ = new int[newsize];
                    m_tokenLimit_ = new int[newsize];
                    System.arraycopy(temptokenindex, 0, m_tokenStart_, 0,
                                     originalsize);
                    System.arraycopy(temptokensize, 0, m_tokenLimit_, 0,
                                     originalsize);
                }
                m_tokenStart_[result] = m_nextOffset_;
                if (m_returnDelimiters_) {
                    int c = UTF16.charAt(m_source_, m_nextOffset_);
                    if (isDelimiter(c)) {
                        if (m_coalesceDelimiters_) {
                            m_tokenLimit_[result] = getNextNonDelimiter(
                                                                m_nextOffset_);
                        } else {
                            // Advance by the whole code point, exactly as
                            // nextToken() does, so that a supplementary
                            // delimiter is not split into two half tokens.
                            int p = m_nextOffset_ + UTF16.getCharCount(c);
                            if (p == m_length_) {
                                p = -1;
                            }
                            m_tokenLimit_[result] = p;
                        }
                    }
                    else {
                        m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
                    }
                    m_nextOffset_ = m_tokenLimit_[result];
                }
                else {
                    m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
                    m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
                }
                result ++;
            } while (m_nextOffset_ >= 0);
            m_tokenOffset_ = 0;
            m_tokenSize_ = result;
            // rewind: counting must not consume any token
            m_nextOffset_ = m_tokenStart_[0];
        }
        return result;
    }

    // private data members -------------------------------------------------

    /**
     * Current offset to the token array. If the array token is not set up yet,
     * this value is a -1
     */
    private int m_tokenOffset_;
    /**
     * Size of the token array. If the array token is not set up yet,
     * this value is a -1
     */
    private int m_tokenSize_;
    /**
     * Array of pre-calculated tokens start indexes in source string.
     * Only the first m_tokenSize_ entries are meaningful.
     * This is only set up during countTokens() and only stores the remaining
     * tokens, not all tokens including parsed ones
     */
    private int m_tokenStart_[];
    /**
     * Array of pre-calculated tokens limit indexes in source string.
     * A negative limit means the token runs to the end of the source string.
     * This is only set up during countTokens() and only stores the remaining
     * tokens, not all tokens including parsed ones
     */
    private int m_tokenLimit_[];
    /**
     * UnicodeSet containing delimiters
     */
    private UnicodeSet m_delimiters_;
    /**
     * String to parse for tokens
     */
    private String m_source_;
    /**
     * Length of m_source_
     */
    private int m_length_;
    /**
     * Current position in string to parse for tokens; negative when there
     * are no more tokens
     */
    private int m_nextOffset_;
    /**
     * Flag indicator if delimiters are to be treated as tokens too
     */
    private boolean m_returnDelimiters_;

    /**
     * Flag indicating whether to coalesce runs of delimiters into single tokens
     */
    private boolean m_coalesceDelimiters_;

    /**
     * Default set of delimiters: space, \t, \n, \r and \f
     */
    private static final UnicodeSet DEFAULT_DELIMITERS_
        = new UnicodeSet(0x09, 0x0a, 0x0c, 0x0d, 0x20, 0x20);   // UnicodeSet("[ \t\n\r\f]", false)
    /**
     * Array size increments
     */
    private static final int TOKEN_SIZE_ = 100;
    /**
     * A empty delimiter UnicodeSet, used when user specified null delimiters
     */
    private static final UnicodeSet EMPTY_DELIMITER_ = UnicodeSet.EMPTY;

    // private methods ------------------------------------------------------

    /**
     * Tests whether a code point is one of the current delimiters. The
     * lookup table built by checkDelimiters() is used when there is one,
     * the delimiter UnicodeSet otherwise.
     * @param c code point to test
     * @return true if c is a delimiter
     */
    private boolean isDelimiter(int c)
    {
        if (delims == null) {
            return m_delimiters_.contains(c);
        }
        return c < delims.length && delims[c];
    }

    /**
     * Gets the index of the next delimiter at or after offset
     * @param offset to the source string
     * @return offset of the immediate next delimiter, otherwise
     *         (- source string length - 1) if offset is negative or there
     *         are no more delimiters from offset onwards. Callers only test
     *         the result for being negative.
     */
    private int getNextDelimiter(int offset)
    {
        if (offset >= 0) {
            int result = offset;
            int c = 0;
            // the table/set choice is made once, outside the scanning loops
            if (delims == null) {
                do {
                    c = UTF16.charAt(m_source_, result);
                    if (m_delimiters_.contains(c)) {
                        break;
                    }
                    result ++;
                } while (result < m_length_);
            } else {
                do {
                    c = UTF16.charAt(m_source_, result);
                    if (c < delims.length && delims[c]) {
                        break;
                    }
                    result ++;
                } while (result < m_length_);
            }
            if (result < m_length_) {
                return result;
            }
        }
        return -1 - m_length_;
    }

    /**
     * Gets the index of the next non-delimiter at or after offset
     * @param offset to the source string
     * @return offset of the immediate next non-delimiter, otherwise
     *         (- source string length - 1) if offset is negative or there
     *         are no more non-delimiters from offset onwards. Callers only
     *         test the result for being negative.
     */
    private int getNextNonDelimiter(int offset)
    {
        if (offset >= 0) {
            int result = offset;
            int c = 0;
            // the table/set choice is made once, outside the scanning loops
            if (delims == null) {
                do {
                    c = UTF16.charAt(m_source_, result);
                    if (!m_delimiters_.contains(c)) {
                        break;
                    }
                    result ++;
                } while (result < m_length_);
            } else {
                do {
                    c = UTF16.charAt(m_source_, result);
                    if (!(c < delims.length && delims[c])) {
                        break;
                    }
                    result ++;
                } while (result < m_length_);
            }
            if (result < m_length_) {
                return result;
            }
        }
        return -1 - m_length_;
    }

    /**
     * Rebuilds the delimiter lookup table from m_delimiters_.
     * <ul>
     * <li>A null or empty set gives an empty table, so nothing is a delimiter.
     * <li>A set whose largest code point is below 0x7f gives a table indexed
     * by code point.
     * <li>Any other set gives a null table, and m_delimiters_ is consulted
     * directly.
     * </ul>
     */
    void checkDelimiters() {
        if (m_delimiters_ == null || m_delimiters_.size() == 0) {
            delims = new boolean[0];
        } else {
            int maxChar = m_delimiters_.getRangeEnd(m_delimiters_.getRangeCount()-1);
            if (maxChar < 0x7f) {
                delims = new boolean[maxChar+1];
                for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) {
                    delims[ch] = true;
                }
            } else {
                delims = null;
            }
        }
    }

    /**
     * Delimiter lookup table indexed by code point, or null when
     * m_delimiters_ has to be consulted directly. It stays null until
     * checkDelimiters() is called.
     */
    private boolean[] delims;
}