// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
**********************************************************************
* Copyright (c) 2003-2011, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: September 23 2003
* Since: ICU 2.8
**********************************************************************
*/
13package com.ibm.icu.impl;
14
15import java.text.ParsePosition;
16
17import com.ibm.icu.text.SymbolTable;
18import com.ibm.icu.text.UTF16;
19
/**
 * An iterator that returns 32-bit code points.  This class is deliberately
 * <em>not</em> related to any of the JDK or ICU4J character iterator classes
 * in order to minimize complexity.
 * @author Alan Liu
 * @since ICU 2.8
 */
27public class RuleCharacterIterator {
28
29    // TODO: Ideas for later.  (Do not implement if not needed, lest the
30    // code coverage numbers go down due to unused methods.)
31    // 1. Add a copy constructor, equals() method, clone() method.
32    // 2. Rather than return DONE, throw an exception if the end
33    // is reached -- this is an alternate usage model, probably not useful.
34    // 3. Return isEscaped from next().  If this happens,
35    // don't keep an isEscaped member variable.
36
37    /**
38     * Text being iterated.
39     */
40    private String text;
41
42    /**
43     * Position of iterator.
44     */
45    private ParsePosition pos;
46
47    /**
48     * Symbol table used to parse and dereference variables.  May be null.
49     */
50    private SymbolTable sym;
51
52    /**
53     * Current variable expansion, or null if none.
54     */
55    private char[] buf;
56
57    /**
58     * Position within buf[].  Meaningless if buf == null.
59     */
60    private int bufPos;
61
62    /**
63     * Flag indicating whether the last character was parsed from an escape.
64     */
65    private boolean isEscaped;
66
67    /**
68     * Value returned when there are no more characters to iterate.
69     */
70    public static final int DONE = -1;
71
72    /**
73     * Bitmask option to enable parsing of variable names.  If (options &
74     * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
75     * its value.  Variables are parsed using the SymbolTable API.
76     */
77    public static final int PARSE_VARIABLES = 1;
78
79    /**
80     * Bitmask option to enable parsing of escape sequences.  If (options &
81     * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
82     * to its value.  Escapes are parsed using Utility.unescapeAt().
83     */
84    public static final int PARSE_ESCAPES   = 2;
85
86    /**
87     * Bitmask option to enable skipping of whitespace.  If (options &
88     * SKIP_WHITESPACE) != 0, then Unicode Pattern_White_Space characters will be silently
89     * skipped, as if they were not present in the input.
90     */
91    public static final int SKIP_WHITESPACE = 4;
92
93    /**
94     * Constructs an iterator over the given text, starting at the given
95     * position.
96     * @param text the text to be iterated
97     * @param sym the symbol table, or null if there is none.  If sym is null,
98     * then variables will not be deferenced, even if the PARSE_VARIABLES
99     * option is set.
100     * @param pos upon input, the index of the next character to return.  If a
101     * variable has been dereferenced, then pos will <em>not</em> increment as
102     * characters of the variable value are iterated.
103     */
104    public RuleCharacterIterator(String text, SymbolTable sym,
105                                 ParsePosition pos) {
106        if (text == null || pos.getIndex() > text.length()) {
107            throw new IllegalArgumentException();
108        }
109        this.text = text;
110        this.sym = sym;
111        this.pos = pos;
112        buf = null;
113    }
114
115    /**
116     * Returns true if this iterator has no more characters to return.
117     */
118    public boolean atEnd() {
119        return buf == null && pos.getIndex() == text.length();
120    }
121
122    /**
123     * Returns the next character using the given options, or DONE if there
124     * are no more characters, and advance the position to the next
125     * character.
126     * @param options one or more of the following options, bitwise-OR-ed
127     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
128     * @return the current 32-bit code point, or DONE
129     */
130    public int next(int options) {
131        int c = DONE;
132        isEscaped = false;
133
134        for (;;) {
135            c = _current();
136            _advance(UTF16.getCharCount(c));
137
138            if (c == SymbolTable.SYMBOL_REF && buf == null &&
139                (options & PARSE_VARIABLES) != 0 && sym != null) {
140                String name = sym.parseReference(text, pos, text.length());
141                // If name == null there was an isolated SYMBOL_REF;
142                // return it.  Caller must be prepared for this.
143                if (name == null) {
144                    break;
145                }
146                bufPos = 0;
147                buf = sym.lookup(name);
148                if (buf == null) {
149                    throw new IllegalArgumentException(
150                                "Undefined variable: " + name);
151                }
152                // Handle empty variable value
153                if (buf.length == 0) {
154                    buf = null;
155                }
156                continue;
157            }
158
159            if ((options & SKIP_WHITESPACE) != 0 &&
160                PatternProps.isWhiteSpace(c)) {
161                continue;
162            }
163
164            if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
165                int offset[] = new int[] { 0 };
166                c = Utility.unescapeAt(lookahead(), offset);
167                jumpahead(offset[0]);
168                isEscaped = true;
169                if (c < 0) {
170                    throw new IllegalArgumentException("Invalid escape");
171                }
172            }
173
174            break;
175        }
176
177        return c;
178    }
179
180    /**
181     * Returns true if the last character returned by next() was
182     * escaped.  This will only be the case if the option passed in to
183     * next() included PARSE_ESCAPED and the next character was an
184     * escape sequence.
185     */
186    public boolean isEscaped() {
187        return isEscaped;
188    }
189
190    /**
191     * Returns true if this iterator is currently within a variable expansion.
192     */
193    public boolean inVariable() {
194        return buf != null;
195    }
196
197    /**
198     * Returns an object which, when later passed to setPos(), will
199     * restore this iterator's position.  Usage idiom:
200     *
201     * RuleCharacterIterator iterator = ...;
202     * Object pos = iterator.getPos(null); // allocate position object
203     * for (;;) {
204     *   pos = iterator.getPos(pos); // reuse position object
205     *   int c = iterator.next(...);
206     *   ...
207     * }
208     * iterator.setPos(pos);
209     *
210     * @param p a position object previously returned by getPos(),
211     * or null.  If not null, it will be updated and returned.  If
212     * null, a new position object will be allocated and returned.
213     * @return a position object which may be passed to setPos(),
214     * either `p,' or if `p' == null, a newly-allocated object
215     */
216    public Object getPos(Object p) {
217        if (p == null) {
218            return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
219        }
220        Object[] a = (Object[]) p;
221        a[0] = buf;
222        int[] v = (int[]) a[1];
223        v[0] = pos.getIndex();
224        v[1] = bufPos;
225        return p;
226    }
227
228    /**
229     * Restores this iterator to the position it had when getPos()
230     * returned the given object.
231     * @param p a position object previously returned by getPos()
232     */
233    public void setPos(Object p) {
234        Object[] a = (Object[]) p;
235        buf = (char[]) a[0];
236        int[] v = (int[]) a[1];
237        pos.setIndex(v[0]);
238        bufPos = v[1];
239    }
240
241    /**
242     * Skips ahead past any ignored characters, as indicated by the given
243     * options.  This is useful in conjunction with the lookahead() method.
244     *
245     * Currently, this only has an effect for SKIP_WHITESPACE.
246     * @param options one or more of the following options, bitwise-OR-ed
247     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
248     */
249    public void skipIgnored(int options) {
250        if ((options & SKIP_WHITESPACE) != 0) {
251            for (;;) {
252                int a = _current();
253                if (!PatternProps.isWhiteSpace(a)) break;
254                _advance(UTF16.getCharCount(a));
255            }
256        }
257    }
258
259    /**
260     * Returns a string containing the remainder of the characters to be
261     * returned by this iterator, without any option processing.  If the
262     * iterator is currently within a variable expansion, this will only
263     * extend to the end of the variable expansion.  This method is provided
264     * so that iterators may interoperate with string-based APIs.  The typical
265     * sequence of calls is to call skipIgnored(), then call lookahead(), then
266     * parse the string returned by lookahead(), then call jumpahead() to
267     * resynchronize the iterator.
268     * @return a string containing the characters to be returned by future
269     * calls to next()
270     */
271    public String lookahead() {
272        if (buf != null) {
273            return new String(buf, bufPos, buf.length - bufPos);
274        } else {
275            return text.substring(pos.getIndex());
276        }
277    }
278
279    /**
280     * Advances the position by the given number of 16-bit code units.
281     * This is useful in conjunction with the lookahead() method.
282     * @param count the number of 16-bit code units to jump over
283     */
284    public void jumpahead(int count) {
285        if (count < 0) {
286            throw new IllegalArgumentException();
287        }
288        if (buf != null) {
289            bufPos += count;
290            if (bufPos > buf.length) {
291                throw new IllegalArgumentException();
292            }
293            if (bufPos == buf.length) {
294                buf = null;
295            }
296        } else {
297            int i = pos.getIndex() + count;
298            pos.setIndex(i);
299            if (i > text.length()) {
300                throw new IllegalArgumentException();
301            }
302        }
303    }
304
305    /**
306     * Returns a string representation of this object, consisting of the
307     * characters being iterated, with a '|' marking the current position.
308     * Position within an expanded variable is <em>not</em> indicated.
309     * @return a string representation of this object
310     */
311    @Override
312    public String toString() {
313        int b = pos.getIndex();
314        return text.substring(0, b) + '|' + text.substring(b);
315    }
316
317    /**
318     * Returns the current 32-bit code point without parsing escapes, parsing
319     * variables, or skipping whitespace.
320     * @return the current 32-bit code point
321     */
322    private int _current() {
323        if (buf != null) {
324            return UTF16.charAt(buf, 0, buf.length, bufPos);
325        } else {
326            int i = pos.getIndex();
327            return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
328        }
329    }
330
331    /**
332     * Advances the position by the given amount.
333     * @param count the number of 16-bit code units to advance past
334     */
335    private void _advance(int count) {
336        if (buf != null) {
337            bufPos += count;
338            if (bufPos == buf.length) {
339                buf = null;
340            }
341        } else {
342            pos.setIndex(pos.getIndex() + count);
343            if (pos.getIndex() > text.length()) {
344                pos.setIndex(text.length());
345            }
346        }
347    }
348}