1/**
2*******************************************************************************
3* Copyright (C) 1996-2016, International Business Machines Corporation and    *
4* others. All Rights Reserved.                                                *
5*******************************************************************************
6*/
7
8package com.ibm.icu.util;
9
10import java.util.Enumeration;
11import java.util.NoSuchElementException;
12
13import com.ibm.icu.text.UTF16;
14import com.ibm.icu.text.UnicodeSet;
15
16/**
 * {@icuenhanced java.util.StringTokenizer}.{@icu _usage_}
18 *
19 * <p>The string tokenizer class allows an application to break a string
20 * into tokens by performing code point comparison.
21 * The <code>StringTokenizer</code> methods do not distinguish
22 * among identifiers, numbers, and quoted strings, nor do they recognize
23 * and skip comments.
24 * <p>
25 * The set of delimiters (the codepoints that separate tokens) may be
26 * specified either at creation time or on a per-token basis.
27 * <p>
28 * An instance of <code>StringTokenizer</code> behaves in one of three ways,
29 * depending on whether it was created with the <code>returnDelims</code>
30 * and <code>coalesceDelims</code>
31 * flags having the value <code>true</code> or <code>false</code>:
32 * <ul>
33 * <li>If returnDelims is <code>false</code>, delimiter code points serve to
34 * separate tokens. A token is a maximal sequence of consecutive
35 * code points that are not delimiters.
36 * <li>If returnDelims is <code>true</code>, delimiter code points are
37 * themselves considered to be tokens. In this case, if coalesceDelims is
38 * <code>true</code>, such tokens will be the maximal sequence of consecutive
39 * code points that <em>are</em> delimiters.  If coalesceDelims is false,
40 * a token will be received for each delimiter code point.
41 * </ul>
42 * <p>A token is thus either one
43 * delimiter code point, a maximal sequence of consecutive code points that
44 * are delimiters, or a maximal sequence of consecutive code
45 * points that are not delimiters.
46 * <p>
47 * A <tt>StringTokenizer</tt> object internally maintains a current
48 * position within the string to be tokenized. Some operations advance this
49 * current position past the code point processed.
50 * <p>
51 * A token is returned by taking a substring of the string that was used to
52 * create the <tt>StringTokenizer</tt> object.
53 * <p>
54 * Example of the use of the default delimiter tokenizer.
55 * <blockquote><pre>
56 * StringTokenizer st = new StringTokenizer("this is a test");
57 * while (st.hasMoreTokens()) {
58 *     println(st.nextToken());
59 *     }
60 * </pre></blockquote>
61 * <p>
62 * prints the following output:
63 * <blockquote><pre>
64 *     this
65 *     is
66 *     a
67 *     test
68 * </pre></blockquote>
69 * <p>
70 * Example of the use of the tokenizer with user specified delimiter.
71 * <blockquote><pre>
72 *     StringTokenizer st = new StringTokenizer(
73 *     "this is a test with supplementary characters &#92;ud800&#92;ud800&#92;udc00&#92;udc00",
74 *         " &#92;ud800&#92;udc00");
75 *     while (st.hasMoreTokens()) {
76 *         println(st.nextToken());
77 *     }
78 * </pre></blockquote>
79 * <p>
80 * prints the following output:
81 * <blockquote><pre>
82 *     this
83 *     is
84 *     a
85 *     test
86 *     with
87 *     supplementary
88 *     characters
89 *     &#92;ud800
90 *     &#92;udc00
91 * </pre></blockquote>
92 *
93 * @author syn wee
94 * @stable ICU 2.4
95 */
96public final class StringTokenizer implements Enumeration<Object>
97{
98    // public constructors ---------------------------------------------
99
100    /**
101     * {@icu} Constructs a string tokenizer for the specified string. All
102     * characters in the delim argument are the delimiters for separating
103     * tokens.
104     * <p>If the returnDelims flag is false, the delimiter characters are
105     * skipped and only serve as separators between tokens.
106     * <p>If the returnDelims flag is true, then the delimiter characters
107     * are also returned as tokens, one per delimiter.
108     * @param str a string to be parsed.
109     * @param delim the delimiters.
110     * @param returndelims flag indicating whether to return the delimiters
111     *        as tokens.
112     * @exception NullPointerException if str is null
113     * @stable ICU 2.4
114     */
115    public StringTokenizer(String str, UnicodeSet delim, boolean returndelims)
116    {
117        this(str, delim, returndelims, false);
118    }
119
120    /**
121     * {@icu} Constructs a string tokenizer for the specified string. All
122     * characters in the delim argument are the delimiters for separating
123     * tokens.
124     * <p>If the returnDelims flag is false, the delimiter characters are
125     * skipped and only serve as separators between tokens.
126     * <p>If the returnDelims flag is true, then the delimiter characters
127     * are also returned as tokens.  If coalescedelims is true, one token
128     * is returned for each run of delimiter characters, otherwise one
129     * token is returned per delimiter.  Since surrogate pairs can be
130     * delimiters, the returned token might be two chars in length.
131     * @param str a string to be parsed.
132     * @param delim the delimiters.
133     * @param returndelims flag indicating whether to return the delimiters
134     *        as tokens.
135     * @param coalescedelims flag indicating whether to return a run of
136     *        delimiters as a single token or as one token per delimiter.
137     *        This only takes effect if returndelims is true.
138     * @exception NullPointerException if str is null
139     * @internal
140     * @deprecated This API is ICU internal only.
141     */
142    @Deprecated
143    public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
144    {
145        m_source_ = str;
146        m_length_ = str.length();
147        if (delim == null) {
148            m_delimiters_ = EMPTY_DELIMITER_;
149        }
150        else {
151            m_delimiters_ = delim;
152        }
153        m_returnDelimiters_ = returndelims;
154        m_coalesceDelimiters_ = coalescedelims;
155        m_tokenOffset_ = -1;
156        m_tokenSize_ = -1;
157        if (m_length_ == 0) {
158            // string length 0, no tokens
159            m_nextOffset_ = -1;
160        }
161        else {
162            m_nextOffset_ = 0;
163            if (!returndelims) {
164                m_nextOffset_ = getNextNonDelimiter(0);
165            }
166        }
167    }
168
169    /**
170     * {@icu} Constructs a string tokenizer for the specified string. The
171     * characters in the delim argument are the delimiters for separating
172     * tokens.
173     * <p>Delimiter characters themselves will not be treated as tokens.
174     * @param str a string to be parsed.
175     * @param delim the delimiters.
176     * @exception NullPointerException if str is null
177     * @stable ICU 2.4
178     */
179    public StringTokenizer(String str, UnicodeSet delim)
180    {
181        this(str, delim, false, false);
182    }
183
184    /**
185     * <p>Constructs a string tokenizer for the specified string. All
186     * characters in the delim argument are the delimiters for separating
187     * tokens.
188     * <p>If the returnDelims flag is false, the delimiter characters are
189     * skipped and only serve as separators between tokens.
190     * <p>If the returnDelims flag is true, then the delimiter characters
191     * are also returned as tokens, one per delimiter.
192     * @param str a string to be parsed.
193     * @param delim the delimiters.
194     * @param returndelims flag indicating whether to return the delimiters
195     *        as tokens.
196     * @exception NullPointerException if str is null
197     * @stable ICU 2.4
198     */
199    public StringTokenizer(String str, String delim, boolean returndelims)
200    {
201        this(str, delim, returndelims, false); // java default behavior
202    }
203
204    /**
205     * <p>Constructs a string tokenizer for the specified string. All
206     * characters in the delim argument are the delimiters for separating
207     * tokens.
208     * <p>If the returnDelims flag is false, the delimiter characters are
209     * skipped and only serve as separators between tokens.
210     * <p>If the returnDelims flag is true, then the delimiter characters
211     * are also returned as tokens.  If coalescedelims is true, one token
212     * is returned for each run of delimiter characters, otherwise one
213     * token is returned per delimiter.  Since surrogate pairs can be
214     * delimiters, the returned token might be two chars in length.
215     * @param str a string to be parsed.
216     * @param delim the delimiters.
217     * @param returndelims flag indicating whether to return the delimiters
218     *        as tokens.
219     * @param coalescedelims flag indicating whether to return a run of
220     *        delimiters as a single token or as one token per delimiter.
221     *        This only takes effect if returndelims is true.
222     * @exception NullPointerException if str is null
223     * @internal
224     * @deprecated This API is ICU internal only.
225     */
226    @Deprecated
227    public StringTokenizer(String str, String delim, boolean returndelims, boolean coalescedelims)
228    {
229        // don't ignore whitespace
230        m_delimiters_ = EMPTY_DELIMITER_;
231        if (delim != null && delim.length() > 0) {
232            m_delimiters_ = new UnicodeSet();
233            m_delimiters_.addAll(delim);
234            checkDelimiters();
235        }
236        m_coalesceDelimiters_ = coalescedelims;
237        m_source_ = str;
238        m_length_ = str.length();
239        m_returnDelimiters_ = returndelims;
240        m_tokenOffset_ = -1;
241        m_tokenSize_ = -1;
242        if (m_length_ == 0) {
243            // string length 0, no tokens
244            m_nextOffset_ = -1;
245        }
246        else {
247            m_nextOffset_ = 0;
248            if (!returndelims) {
249                m_nextOffset_ = getNextNonDelimiter(0);
250            }
251        }
252    }
253
254    /**
255     * <p>Constructs a string tokenizer for the specified string. The
256     * characters in the delim argument are the delimiters for separating
257     * tokens.
258     * <p>Delimiter characters themselves will not be treated as tokens.
259     * @param str a string to be parsed.
260     * @param delim the delimiters.
261     * @exception NullPointerException if str is null
262     * @stable ICU 2.4
263     */
264    public StringTokenizer(String str, String delim)
265    {
266        // don't ignore whitespace
267        this(str, delim, false, false);
268    }
269
270    /**
271     * <p>Constructs a string tokenizer for the specified string.
272     * The tokenizer uses the default delimiter set, which is
273     * " &#92;t&#92;n&#92;r&#92;f":
274     * the space character, the tab character, the newline character, the
275     * carriage-return character, and the form-feed character.
276     * <p>Delimiter characters themselves will not be treated as tokens.
277     * @param str a string to be parsed
278     * @exception NullPointerException if str is null
279     * @stable ICU 2.4
280     */
281    public StringTokenizer(String str)
282    {
283        this(str, DEFAULT_DELIMITERS_, false, false);
284    }
285
286    // public methods --------------------------------------------------
287
288    /**
289     * Tests if there are more tokens available from this tokenizer's
290     * string.
291     * If this method returns <tt>true</tt>, then a subsequent call to
292     * <tt>nextToken</tt> with no argument will successfully return a token.
293     * @return <code>true</code> if and only if there is at least one token
294     *         in the string after the current position; <code>false</code>
295     *         otherwise.
296     * @stable ICU 2.4
297     */
298    public boolean hasMoreTokens()
299    {
300        return m_nextOffset_ >= 0;
301    }
302
303    /**
304     * Returns the next token from this string tokenizer.
305     * @return the next token from this string tokenizer.
306     * @exception NoSuchElementException if there are no more tokens in
307     *            this tokenizer's string.
308     * @stable ICU 2.4
309     */
310    public String nextToken()
311    {
312        if (m_tokenOffset_ < 0) {
313            if (m_nextOffset_ < 0) {
314                throw new NoSuchElementException("No more tokens in String");
315            }
316            // pre-calculations of tokens not done
317            if (m_returnDelimiters_) {
318                int tokenlimit = 0;
319                int c = UTF16.charAt(m_source_, m_nextOffset_);
320                boolean contains = delims == null
321                    ? m_delimiters_.contains(c)
322                    : c < delims.length && delims[c];
323                if (contains) {
324                     if (m_coalesceDelimiters_) {
325                        tokenlimit = getNextNonDelimiter(m_nextOffset_);
326                     } else {
327                        tokenlimit = m_nextOffset_ + UTF16.getCharCount(c);
328                        if (tokenlimit == m_length_) {
329                            tokenlimit = -1;
330                        }
331                     }
332                }
333                else {
334                    tokenlimit = getNextDelimiter(m_nextOffset_);
335                }
336                String result;
337                if (tokenlimit < 0) {
338                    result = m_source_.substring(m_nextOffset_);
339                }
340                else {
341                    result = m_source_.substring(m_nextOffset_, tokenlimit);
342                }
343                m_nextOffset_ = tokenlimit;
344                return result;
345            }
346            else {
347                int tokenlimit = getNextDelimiter(m_nextOffset_);
348                String result;
349                if (tokenlimit < 0) {
350                    result = m_source_.substring(m_nextOffset_);
351                    m_nextOffset_ = tokenlimit;
352                }
353                else {
354                    result = m_source_.substring(m_nextOffset_, tokenlimit);
355                    m_nextOffset_ = getNextNonDelimiter(tokenlimit);
356                }
357
358                return result;
359            }
360        }
361        // count was called before and we have all the tokens
362        if (m_tokenOffset_ >= m_tokenSize_) {
363            throw new NoSuchElementException("No more tokens in String");
364        }
365        String result;
366        if (m_tokenLimit_[m_tokenOffset_] >= 0) {
367            result = m_source_.substring(m_tokenStart_[m_tokenOffset_],
368                                         m_tokenLimit_[m_tokenOffset_]);
369        }
370        else {
371            result = m_source_.substring(m_tokenStart_[m_tokenOffset_]);
372        }
373        m_tokenOffset_ ++;
374        m_nextOffset_ = -1;
375        if (m_tokenOffset_ < m_tokenSize_) {
376            m_nextOffset_ = m_tokenStart_[m_tokenOffset_];
377        }
378        return result;
379    }
380
381    /**
382     * Returns the next token in this string tokenizer's string. First,
383     * the set of characters considered to be delimiters by this
384     * <tt>StringTokenizer</tt> object is changed to be the characters in
385     * the string <tt>delim</tt>. Then the next token in the string
386     * after the current position is returned. The current position is
387     * advanced beyond the recognized token.  The new delimiter set
388     * remains the default after this call.
389     * @param delim the new delimiters.
390     * @return the next token, after switching to the new delimiter set.
391     * @exception NoSuchElementException if there are no more tokens in
392     *            this tokenizer's string.
393     * @stable ICU 2.4
394     */
395    public String nextToken(String delim)
396    {
397        m_delimiters_ = EMPTY_DELIMITER_;
398        if (delim != null && delim.length() > 0) {
399            m_delimiters_ = new UnicodeSet();
400            m_delimiters_.addAll(delim);
401        }
402        return nextToken(m_delimiters_);
403    }
404
405    /**
406     * {@icu} Returns the next token in this string tokenizer's string. First,
407     * the set of characters considered to be delimiters by this
408     * <tt>StringTokenizer</tt> object is changed to be the characters in
409     * the string <tt>delim</tt>. Then the next token in the string
410     * after the current position is returned. The current position is
411     * advanced beyond the recognized token.  The new delimiter set
412     * remains the default after this call.
413     * @param delim the new delimiters.
414     * @return the next token, after switching to the new delimiter set.
415     * @exception NoSuchElementException if there are no more tokens in
416     *            this tokenizer's string.
417     * @stable ICU 2.4
418     */
419    public String nextToken(UnicodeSet delim)
420    {
421        m_delimiters_ = delim;
422        checkDelimiters();
423        m_tokenOffset_ = -1;
424        m_tokenSize_ = -1;
425        if (!m_returnDelimiters_) {
426            m_nextOffset_ = getNextNonDelimiter(m_nextOffset_);
427        }
428        return nextToken();
429    }
430
431    /**
432     * Returns the same value as the <code>hasMoreTokens</code> method.
433     * It exists so that this class can implement the
434     * <code>Enumeration</code> interface.
435     * @return <code>true</code> if there are more tokens;
436     *         <code>false</code> otherwise.
437     * @see #hasMoreTokens()
438     * @stable ICU 2.4
439     */
440    public boolean hasMoreElements()
441    {
442        return hasMoreTokens();
443    }
444
445    /**
446     * Returns the same value as the <code>nextToken</code> method, except
447     * that its declared return value is <code>Object</code> rather than
448     * <code>String</code>. It exists so that this class can implement the
449     * <code>Enumeration</code> interface.
450     * @return the next token in the string.
451     * @exception NoSuchElementException if there are no more tokens in
452     *            this tokenizer's string.
453     * @see #nextToken()
454     * @stable ICU 2.4
455     */
456    public Object nextElement()
457    {
458        return nextToken();
459    }
460
461    /**
462     * Calculates the number of times that this tokenizer's
463     * <code>nextToken</code> method can be called before it generates an
464     * exception. The current position is not advanced.
465     * @return the number of tokens remaining in the string using the
466     *         current delimiter set.
467     * @see #nextToken()
468     * @stable ICU 2.4
469     */
470    public int countTokens()
471    {
472        int result = 0;
473        if (hasMoreTokens()) {
474            if (m_tokenOffset_ >= 0) {
475                return m_tokenSize_ - m_tokenOffset_;
476            }
477            if (m_tokenStart_ == null) {
478                m_tokenStart_ = new int[TOKEN_SIZE_];
479                m_tokenLimit_ = new int[TOKEN_SIZE_];
480            }
481            do {
482                if (m_tokenStart_.length == result) {
483                    int temptokenindex[] = m_tokenStart_;
484                    int temptokensize[] = m_tokenLimit_;
485                    int originalsize = temptokenindex.length;
486                    int newsize = originalsize + TOKEN_SIZE_;
487                    m_tokenStart_ = new int[newsize];
488                    m_tokenLimit_ = new int[newsize];
489                    System.arraycopy(temptokenindex, 0, m_tokenStart_, 0,
490                                     originalsize);
491                    System.arraycopy(temptokensize, 0, m_tokenLimit_, 0,
492                                     originalsize);
493                }
494                m_tokenStart_[result] = m_nextOffset_;
495                if (m_returnDelimiters_) {
496                    int c = UTF16.charAt(m_source_, m_nextOffset_);
497                    boolean contains = delims == null
498                        ? m_delimiters_.contains(c)
499                        : c < delims.length && delims[c];
500                    if (contains) {
501                        if (m_coalesceDelimiters_) {
502                            m_tokenLimit_[result] = getNextNonDelimiter(
503                                                                m_nextOffset_);
504                        } else {
505                            int p = m_nextOffset_ + 1;
506                            if (p == m_length_) {
507                                p = -1;
508                            }
509                            m_tokenLimit_[result] = p;
510
511                        }
512                    }
513                    else {
514                        m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
515                    }
516                    m_nextOffset_ = m_tokenLimit_[result];
517                }
518                else {
519                    m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
520                    m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
521                }
522                result ++;
523            } while (m_nextOffset_ >= 0);
524            m_tokenOffset_ = 0;
525            m_tokenSize_ = result;
526            m_nextOffset_ = m_tokenStart_[0];
527        }
528        return result;
529    }
530
531    // private data members -------------------------------------------------
532
533    /**
534     * Current offset to the token array. If the array token is not set up yet,
535     * this value is a -1
536     */
537    private int m_tokenOffset_;
538    /**
539     * Size of the token array. If the array token is not set up yet,
540     * this value is a -1
541     */
542    private int m_tokenSize_;
543    /**
544     * Array of pre-calculated tokens start indexes in source string terminated
545     * by -1.
546     * This is only set up during countTokens() and only stores the remaining
547     * tokens, not all tokens including parsed ones
548     */
549    private int m_tokenStart_[];
550    /**
551     * Array of pre-calculated tokens limit indexes in source string.
552     * This is only set up during countTokens() and only stores the remaining
553     * tokens, not all tokens including parsed ones
554     */
555    private int m_tokenLimit_[];
556    /**
557     * UnicodeSet containing delimiters
558     */
559    private UnicodeSet m_delimiters_;
560    /**
561     * String to parse for tokens
562     */
563    private String m_source_;
564    /**
565     * Length of m_source_
566     */
567    private int m_length_;
568    /**
569     * Current position in string to parse for tokens
570     */
571    private int m_nextOffset_;
572    /**
573     * Flag indicator if delimiters are to be treated as tokens too
574     */
575    private boolean m_returnDelimiters_;
576
577    /**
578     * Flag indicating whether to coalesce runs of delimiters into single tokens
579     */
580    private boolean m_coalesceDelimiters_;
581
582    /**
583     * Default set of delimiters &#92;t&#92;n&#92;r&#92;f
584     */
585    private static final UnicodeSet DEFAULT_DELIMITERS_
586        = new UnicodeSet(0x09, 0x0a, 0x0c, 0x0d, 0x20, 0x20);   // UnicodeSet("[ \t\n\r\f]", false)
587    /**
588     * Array size increments
589     */
590    private static final int TOKEN_SIZE_ = 100;
591    /**
592     * A empty delimiter UnicodeSet, used when user specified null delimiters
593     */
594    private static final UnicodeSet EMPTY_DELIMITER_ = UnicodeSet.EMPTY;
595
596    // private methods ------------------------------------------------------
597
598    /**
599     * Gets the index of the next delimiter after offset
600     * @param offset to the source string
601     * @return offset of the immediate next delimiter, otherwise
602     *         (- source string length - 1) if there
603     *         are no more delimiters after m_nextOffset
604     */
605    private int getNextDelimiter(int offset)
606    {
607        if (offset >= 0) {
608            int result = offset;
609            int c = 0;
610            if (delims == null) {
611                do {
612                    c = UTF16.charAt(m_source_, result);
613                    if (m_delimiters_.contains(c)) {
614                        break;
615                    }
616                    result ++;
617                } while (result < m_length_);
618            } else {
619                do {
620                    c = UTF16.charAt(m_source_, result);
621                    if (c < delims.length && delims[c]) {
622                        break;
623                    }
624                    result ++;
625                } while (result < m_length_);
626            }
627            if (result < m_length_) {
628                return result;
629            }
630        }
631        return -1 - m_length_;
632    }
633
634    /**
635     * Gets the index of the next non-delimiter after m_nextOffset_
636     * @param offset to the source string
637     * @return offset of the immediate next non-delimiter, otherwise
638     *         (- source string length - 1) if there
639     *         are no more delimiters after m_nextOffset
640     */
641    private int getNextNonDelimiter(int offset)
642    {
643        if (offset >= 0) {
644            int result = offset;
645            int c = 0;
646            if (delims == null) {
647                do {
648                    c = UTF16.charAt(m_source_, result);
649                    if (!m_delimiters_.contains(c)) {
650                        break;
651                    }
652                    result ++;
653                } while (result < m_length_);
654            } else {
655                do {
656                    c = UTF16.charAt(m_source_, result);
657                    if (!(c < delims.length && delims[c])) {
658                        break;
659                    }
660                    result ++;
661                } while (result < m_length_);
662            }
663            if (result < m_length_) {
664                return result;
665            }
666        }
667        return -1 - m_length_;
668    }
669
670    void checkDelimiters() {
671        if (m_delimiters_ == null || m_delimiters_.size() == 0) {
672            delims = new boolean[0];
673        } else {
674            int maxChar = m_delimiters_.getRangeEnd(m_delimiters_.getRangeCount()-1);
675            if (maxChar < 0x7f) {
676                delims = new boolean[maxChar+1];
677                for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) {
678                    delims[ch] = true;
679                }
680            } else {
681                delims = null;
682            }
683        }
684    }
685    private boolean[] delims;
686}
687