1/* 2********************************************************************** 3* Copyright (c) 2003-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* Author: Alan Liu 7* Created: September 23 2003 8* Since: ICU 2.8 9********************************************************************** 10*/ 11package com.ibm.icu.impl; 12 13import java.text.ParsePosition; 14 15import com.ibm.icu.text.SymbolTable; 16import com.ibm.icu.text.UTF16; 17 18/** 19 * An iterator that returns 32-bit code points. This class is deliberately 20 * <em>not</em> related to any of the JDK or ICU4J character iterator classes 21 * in order to minimize complexity. 22 * @author Alan Liu 23 * @since ICU 2.8 24 */ 25public class RuleCharacterIterator { 26 27 // TODO: Ideas for later. (Do not implement if not needed, lest the 28 // code coverage numbers go down due to unused methods.) 29 // 1. Add a copy constructor, equals() method, clone() method. 30 // 2. Rather than return DONE, throw an exception if the end 31 // is reached -- this is an alternate usage model, probably not useful. 32 // 3. Return isEscaped from next(). If this happens, 33 // don't keep an isEscaped member variable. 34 35 /** 36 * Text being iterated. 37 */ 38 private String text; 39 40 /** 41 * Position of iterator. 42 */ 43 private ParsePosition pos; 44 45 /** 46 * Symbol table used to parse and dereference variables. May be null. 47 */ 48 private SymbolTable sym; 49 50 /** 51 * Current variable expansion, or null if none. 52 */ 53 private char[] buf; 54 55 /** 56 * Position within buf[]. Meaningless if buf == null. 57 */ 58 private int bufPos; 59 60 /** 61 * Flag indicating whether the last character was parsed from an escape. 62 */ 63 private boolean isEscaped; 64 65 /** 66 * Value returned when there are no more characters to iterate. 67 */ 68 public static final int DONE = -1; 69 70 /** 71 * Bitmask option to enable parsing of variable names. If (options & 72 * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to 73 * its value. Variables are parsed using the SymbolTable API. 74 */ 75 public static final int PARSE_VARIABLES = 1; 76 77 /** 78 * Bitmask option to enable parsing of escape sequences. If (options & 79 * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded 80 * to its value. Escapes are parsed using Utility.unescapeAt(). 81 */ 82 public static final int PARSE_ESCAPES = 2; 83 84 /** 85 * Bitmask option to enable skipping of whitespace. If (options & 86 * SKIP_WHITESPACE) != 0, then Unicode Pattern_White_Space characters will be silently 87 * skipped, as if they were not present in the input. 88 */ 89 public static final int SKIP_WHITESPACE = 4; 90 91 /** 92 * Constructs an iterator over the given text, starting at the given 93 * position. 94 * @param text the text to be iterated 95 * @param sym the symbol table, or null if there is none. If sym is null, 96 * then variables will not be deferenced, even if the PARSE_VARIABLES 97 * option is set. 98 * @param pos upon input, the index of the next character to return. If a 99 * variable has been dereferenced, then pos will <em>not</em> increment as 100 * characters of the variable value are iterated. 101 */ 102 public RuleCharacterIterator(String text, SymbolTable sym, 103 ParsePosition pos) { 104 if (text == null || pos.getIndex() > text.length()) { 105 throw new IllegalArgumentException(); 106 } 107 this.text = text; 108 this.sym = sym; 109 this.pos = pos; 110 buf = null; 111 } 112 113 /** 114 * Returns true if this iterator has no more characters to return. 115 */ 116 public boolean atEnd() { 117 return buf == null && pos.getIndex() == text.length(); 118 } 119 120 /** 121 * Returns the next character using the given options, or DONE if there 122 * are no more characters, and advance the position to the next 123 * character. 124 * @param options one or more of the following options, bitwise-OR-ed 125 * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE. 126 * @return the current 32-bit code point, or DONE 127 */ 128 public int next(int options) { 129 int c = DONE; 130 isEscaped = false; 131 132 for (;;) { 133 c = _current(); 134 _advance(UTF16.getCharCount(c)); 135 136 if (c == SymbolTable.SYMBOL_REF && buf == null && 137 (options & PARSE_VARIABLES) != 0 && sym != null) { 138 String name = sym.parseReference(text, pos, text.length()); 139 // If name == null there was an isolated SYMBOL_REF; 140 // return it. Caller must be prepared for this. 141 if (name == null) { 142 break; 143 } 144 bufPos = 0; 145 buf = sym.lookup(name); 146 if (buf == null) { 147 throw new IllegalArgumentException( 148 "Undefined variable: " + name); 149 } 150 // Handle empty variable value 151 if (buf.length == 0) { 152 buf = null; 153 } 154 continue; 155 } 156 157 if ((options & SKIP_WHITESPACE) != 0 && 158 PatternProps.isWhiteSpace(c)) { 159 continue; 160 } 161 162 if (c == '\\' && (options & PARSE_ESCAPES) != 0) { 163 int offset[] = new int[] { 0 }; 164 c = Utility.unescapeAt(lookahead(), offset); 165 jumpahead(offset[0]); 166 isEscaped = true; 167 if (c < 0) { 168 throw new IllegalArgumentException("Invalid escape"); 169 } 170 } 171 172 break; 173 } 174 175 return c; 176 } 177 178 /** 179 * Returns true if the last character returned by next() was 180 * escaped. This will only be the case if the option passed in to 181 * next() included PARSE_ESCAPED and the next character was an 182 * escape sequence. 183 */ 184 public boolean isEscaped() { 185 return isEscaped; 186 } 187 188 /** 189 * Returns true if this iterator is currently within a variable expansion. 190 */ 191 public boolean inVariable() { 192 return buf != null; 193 } 194 195 /** 196 * Returns an object which, when later passed to setPos(), will 197 * restore this iterator's position. Usage idiom: 198 * 199 * RuleCharacterIterator iterator = ...; 200 * Object pos = iterator.getPos(null); // allocate position object 201 * for (;;) { 202 * pos = iterator.getPos(pos); // reuse position object 203 * int c = iterator.next(...); 204 * ... 205 * } 206 * iterator.setPos(pos); 207 * 208 * @param p a position object previously returned by getPos(), 209 * or null. If not null, it will be updated and returned. If 210 * null, a new position object will be allocated and returned. 211 * @return a position object which may be passed to setPos(), 212 * either `p,' or if `p' == null, a newly-allocated object 213 */ 214 public Object getPos(Object p) { 215 if (p == null) { 216 return new Object[] {buf, new int[] {pos.getIndex(), bufPos}}; 217 } 218 Object[] a = (Object[]) p; 219 a[0] = buf; 220 int[] v = (int[]) a[1]; 221 v[0] = pos.getIndex(); 222 v[1] = bufPos; 223 return p; 224 } 225 226 /** 227 * Restores this iterator to the position it had when getPos() 228 * returned the given object. 229 * @param p a position object previously returned by getPos() 230 */ 231 public void setPos(Object p) { 232 Object[] a = (Object[]) p; 233 buf = (char[]) a[0]; 234 int[] v = (int[]) a[1]; 235 pos.setIndex(v[0]); 236 bufPos = v[1]; 237 } 238 239 /** 240 * Skips ahead past any ignored characters, as indicated by the given 241 * options. This is useful in conjunction with the lookahead() method. 242 * 243 * Currently, this only has an effect for SKIP_WHITESPACE. 244 * @param options one or more of the following options, bitwise-OR-ed 245 * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE. 246 */ 247 public void skipIgnored(int options) { 248 if ((options & SKIP_WHITESPACE) != 0) { 249 for (;;) { 250 int a = _current(); 251 if (!PatternProps.isWhiteSpace(a)) break; 252 _advance(UTF16.getCharCount(a)); 253 } 254 } 255 } 256 257 /** 258 * Returns a string containing the remainder of the characters to be 259 * returned by this iterator, without any option processing. If the 260 * iterator is currently within a variable expansion, this will only 261 * extend to the end of the variable expansion. This method is provided 262 * so that iterators may interoperate with string-based APIs. The typical 263 * sequence of calls is to call skipIgnored(), then call lookahead(), then 264 * parse the string returned by lookahead(), then call jumpahead() to 265 * resynchronize the iterator. 266 * @return a string containing the characters to be returned by future 267 * calls to next() 268 */ 269 public String lookahead() { 270 if (buf != null) { 271 return new String(buf, bufPos, buf.length - bufPos); 272 } else { 273 return text.substring(pos.getIndex()); 274 } 275 } 276 277 /** 278 * Advances the position by the given number of 16-bit code units. 279 * This is useful in conjunction with the lookahead() method. 280 * @param count the number of 16-bit code units to jump over 281 */ 282 public void jumpahead(int count) { 283 if (count < 0) { 284 throw new IllegalArgumentException(); 285 } 286 if (buf != null) { 287 bufPos += count; 288 if (bufPos > buf.length) { 289 throw new IllegalArgumentException(); 290 } 291 if (bufPos == buf.length) { 292 buf = null; 293 } 294 } else { 295 int i = pos.getIndex() + count; 296 pos.setIndex(i); 297 if (i > text.length()) { 298 throw new IllegalArgumentException(); 299 } 300 } 301 } 302 303 /** 304 * Returns a string representation of this object, consisting of the 305 * characters being iterated, with a '|' marking the current position. 306 * Position within an expanded variable is <em>not</em> indicated. 307 * @return a string representation of this object 308 */ 309 public String toString() { 310 int b = pos.getIndex(); 311 return text.substring(0, b) + '|' + text.substring(b); 312 } 313 314 /** 315 * Returns the current 32-bit code point without parsing escapes, parsing 316 * variables, or skipping whitespace. 317 * @return the current 32-bit code point 318 */ 319 private int _current() { 320 if (buf != null) { 321 return UTF16.charAt(buf, 0, buf.length, bufPos); 322 } else { 323 int i = pos.getIndex(); 324 return (i < text.length()) ? UTF16.charAt(text, i) : DONE; 325 } 326 } 327 328 /** 329 * Advances the position by the given amount. 330 * @param count the number of 16-bit code units to advance past 331 */ 332 private void _advance(int count) { 333 if (buf != null) { 334 bufPos += count; 335 if (bufPos == buf.length) { 336 buf = null; 337 } 338 } else { 339 pos.setIndex(pos.getIndex() + count); 340 if (pos.getIndex() > text.length()) { 341 pos.setIndex(text.length()); 342 } 343 } 344 } 345}