1/* 2 ******************************************************************************* 3 * Copyright (C) 2006-2009, Google, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7package com.ibm.icu.impl; 8 9import com.ibm.icu.text.UTF16; 10import com.ibm.icu.text.UnicodeSet; 11 12/** 13 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax. 14 * The '' (two quotes) is treated as a single quote, inside or outside a quote 15 * <ul> 16 * <li>Any ignorable characters are ignored in parsing.</li> 17 * <li>Any syntax characters are broken into separate tokens</li> 18 * <li>Quote characters can be specified: '...', "...", and \x </li> 19 * <li>Other characters are treated as literals</li> 20 * </ul> 21 */ 22public class PatternTokenizer { 23 // settings used in the interpretation of the pattern 24 private UnicodeSet ignorableCharacters = new UnicodeSet(); 25 private UnicodeSet syntaxCharacters = new UnicodeSet(); 26 private UnicodeSet extraQuotingCharacters = new UnicodeSet(); 27 private UnicodeSet escapeCharacters = new UnicodeSet(); 28 private boolean usingSlash = false; 29 private boolean usingQuote = false; 30 31 // transient data, set when needed. Null it out for any changes in the above fields. 32 private transient UnicodeSet needingQuoteCharacters = null; 33 34 // data about the current pattern being parsed. start gets moved as we go along. 35 private int start; 36 private int limit; 37 private String pattern; 38 39 public UnicodeSet getIgnorableCharacters() { 40 return (UnicodeSet) ignorableCharacters.clone(); 41 } 42 /** 43 * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]"); 44 * @param ignorableCharacters Characters to be ignored. 45 * @return A PatternTokenizer object in which characters are specified as ignored characters. 46 */ 47 public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) { 48 this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone(); 49 needingQuoteCharacters = null; 50 return this; 51 } 52 public UnicodeSet getSyntaxCharacters() { 53 return (UnicodeSet) syntaxCharacters.clone(); 54 } 55 public UnicodeSet getExtraQuotingCharacters() { 56 return (UnicodeSet) extraQuotingCharacters.clone(); 57 } 58 /** 59 * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]") 60 * @param syntaxCharacters Characters to be set as syntax characters. 61 * @return A PatternTokenizer object in which characters are specified as syntax characters. 62 */ 63 public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) { 64 this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone(); 65 needingQuoteCharacters = null; 66 return this; 67 } 68 /** 69 * Sets the extra characters to be quoted in literals 70 * @param syntaxCharacters Characters to be set as extra quoting characters. 71 * @return A PatternTokenizer object in which characters are specified as extra quoting characters. 72 */ 73 public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) { 74 this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone(); 75 needingQuoteCharacters = null; 76 return this; 77 } 78 79 public UnicodeSet getEscapeCharacters() { 80 return (UnicodeSet) escapeCharacters.clone(); 81 } 82 /** 83 * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]"); 84 * @param escapeCharacters Characters to be set as escape characters. 85 * @return A PatternTokenizer object in which characters are specified as escape characters. 86 */ 87 public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) { 88 this.escapeCharacters = (UnicodeSet) escapeCharacters.clone(); 89 return this; 90 } 91 public boolean isUsingQuote() { 92 return usingQuote; 93 } 94 public PatternTokenizer setUsingQuote(boolean usingQuote) { 95 this.usingQuote = usingQuote; 96 needingQuoteCharacters = null; 97 return this; 98 } 99 public boolean isUsingSlash() { 100 return usingSlash; 101 } 102 public PatternTokenizer setUsingSlash(boolean usingSlash) { 103 this.usingSlash = usingSlash; 104 needingQuoteCharacters = null; 105 return this; 106 } 107 // public UnicodeSet getQuoteCharacters() { 108// return (UnicodeSet) quoteCharacters.clone(); 109// } 110// public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) { 111// this.quoteCharacters = (UnicodeSet) quoteCharacters.clone(); 112// needingQuoteCharacters = null; 113// return this; 114// } 115 public int getLimit() { 116 return limit; 117 } 118 public PatternTokenizer setLimit(int limit) { 119 this.limit = limit; 120 return this; 121 } 122 public int getStart() { 123 return start; 124 } 125 public PatternTokenizer setStart(int start) { 126 this.start = start; 127 return this; 128 } 129 130 public PatternTokenizer setPattern(CharSequence pattern) { 131 return setPattern(pattern.toString()); 132 } 133 134 public PatternTokenizer setPattern(String pattern) { 135 if (pattern == null) { 136 throw new IllegalArgumentException("Inconsistent arguments"); 137 } 138 this.start = 0; 139 this.limit = pattern.length(); 140 this.pattern = pattern; 141 return this; 142 } 143 144 public static final char SINGLE_QUOTE = '\''; 145 public static final char BACK_SLASH = '\\'; 146 private static int NO_QUOTE = -1, IN_QUOTE = -2; 147 148 public String quoteLiteral(CharSequence string) { 149 return quoteLiteral(string.toString()); 150 } 151 152 /** 153 * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes. 154 * @param string String passed to quote a literal string. 155 * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes. 156 */ 157 public String quoteLiteral(String string) { 158 if (needingQuoteCharacters == null) { 159 needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters) 160 if (usingSlash) needingQuoteCharacters.add(BACK_SLASH); 161 if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE); 162 } 163 StringBuffer result = new StringBuffer(); 164 int quotedChar = NO_QUOTE; 165 int cp; 166 for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { 167 cp = UTF16.charAt(string, i); 168 if (escapeCharacters.contains(cp)) { 169 // we may have to fix up previous characters 170 if (quotedChar == IN_QUOTE) { 171 result.append(SINGLE_QUOTE); 172 quotedChar = NO_QUOTE; 173 } 174 appendEscaped(result, cp); 175 continue; 176 } 177 178 if (needingQuoteCharacters.contains(cp)) { 179 // if we have already started a quote 180 if (quotedChar == IN_QUOTE) { 181 UTF16.append(result, cp); 182 if (usingQuote && cp == SINGLE_QUOTE) { // double it 183 result.append(SINGLE_QUOTE); 184 } 185 continue; 186 } 187 // otherwise not already in quote 188 if (usingSlash) { 189 result.append(BACK_SLASH); 190 UTF16.append(result, cp); 191 continue; 192 } 193 if (usingQuote) { 194 if (cp == SINGLE_QUOTE) { // double it and continue 195 result.append(SINGLE_QUOTE); 196 result.append(SINGLE_QUOTE); 197 continue; 198 } 199 result.append(SINGLE_QUOTE); 200 UTF16.append(result, cp); 201 quotedChar = IN_QUOTE; 202 continue; 203 } 204 // we have no choice but to use \\u or \\U 205 appendEscaped(result, cp); 206 continue; 207 } 208 // otherwise cp doesn't need quoting 209 // we may have to fix up previous characters 210 if (quotedChar == IN_QUOTE) { 211 result.append(SINGLE_QUOTE); 212 quotedChar = NO_QUOTE; 213 } 214 UTF16.append(result, cp); 215 } 216 // all done. 217 // we may have to fix up previous characters 218 if (quotedChar == IN_QUOTE) { 219 result.append(SINGLE_QUOTE); 220 } 221 return result.toString(); 222 } 223 224 private void appendEscaped(StringBuffer result, int cp) { 225 if (cp <= 0xFFFF) { 226 result.append("\\u").append(Utility.hex(cp,4)); 227 } else { 228 result.append("\\U").append(Utility.hex(cp,8)); 229 } 230 } 231 232 public String normalize() { 233 int oldStart = start; 234 StringBuffer result = new StringBuffer(); 235 StringBuffer buffer = new StringBuffer(); 236 while (true) { 237 buffer.setLength(0); 238 int status = next(buffer); 239 if (status == DONE) { 240 start = oldStart; 241 return result.toString(); 242 } 243 if (status != SYNTAX) { 244 result.append(quoteLiteral(buffer)); 245 } else { 246 result.append(buffer); 247 } 248 } 249 } 250 251 public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5; 252 253 private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4; 254 255 public int next(StringBuffer buffer) { 256 if (start >= limit) return DONE; 257 int status = UNKNOWN; 258 int lastQuote = UNKNOWN; 259 int quoteStatus = NONE; 260 int hexCount = 0; 261 int hexValue = 0; 262 int cp; 263 main: 264 for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { 265 cp = UTF16.charAt(pattern, i); 266 // if we are in a quote, then handle it. 267 switch (quoteStatus) { 268 case SLASH_START: 269 switch (cp) { 270 case 'u': 271 quoteStatus = HEX; 272 hexCount = 4; 273 hexValue = 0; 274 continue main; 275 case 'U': 276 quoteStatus = HEX; 277 hexCount = 8; 278 hexValue = 0; 279 continue main; 280 default: 281 if (usingSlash) { 282 UTF16.append(buffer, cp); 283 quoteStatus = NONE; 284 continue main; 285 } else { 286 buffer.append(BACK_SLASH); 287 quoteStatus = NONE; 288 } 289 } 290 break; // fall through to NONE 291 case HEX: 292 hexValue <<= 4; 293 hexValue += cp; 294 switch (cp) { 295 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': 296 hexValue -= '0'; break; 297 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 298 hexValue -= 'a' - 10; break; 299 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 300 hexValue -= 'A' - 10; break; 301 default: 302 start = i; 303 return BROKEN_ESCAPE; 304 } 305 --hexCount; 306 if (hexCount == 0) { 307 quoteStatus = NONE; 308 UTF16.append(buffer, hexValue); 309 } 310 continue main; 311 case AFTER_QUOTE: 312 // see if we get another quote character 313 // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote 314 if (cp == lastQuote) { 315 UTF16.append(buffer, cp); 316 quoteStatus = NORMAL_QUOTE; 317 continue main; 318 } 319 quoteStatus = NONE; 320 break; // fall through to NONE 321 case START_QUOTE: 322 // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote 323 if (cp == lastQuote) { 324 UTF16.append(buffer, cp); 325 quoteStatus = NONE; // get out of quote, with no trace remaining 326 continue; 327 } 328 // otherwise get into quote 329 UTF16.append(buffer, cp); 330 quoteStatus = NORMAL_QUOTE; 331 continue main; 332 case NORMAL_QUOTE: 333 if (cp == lastQuote) { 334 quoteStatus = AFTER_QUOTE; // get out of quote 335 continue main; 336 } 337 UTF16.append(buffer, cp); 338 continue main; 339 } 340 341 if (ignorableCharacters.contains(cp)) { 342 continue; 343 } 344 // do syntax characters 345 if (syntaxCharacters.contains(cp)) { 346 if (status == UNKNOWN) { 347 UTF16.append(buffer, cp); 348 start = i + UTF16.getCharCount(cp); 349 return SYNTAX; 350 } else { // LITERAL, so back up and break 351 start = i; 352 return status; 353 } 354 } 355 // otherwise it is a literal; keep on going 356 status = LITERAL; 357 if (cp == BACK_SLASH) { 358 quoteStatus = SLASH_START; 359 continue; 360 } else if (usingQuote && cp == SINGLE_QUOTE) { 361 lastQuote = cp; 362 quoteStatus = START_QUOTE; 363 continue; 364 } 365 // normal literals 366 UTF16.append(buffer, cp); 367 } 368 // handle final cleanup 369 start = limit; 370 switch (quoteStatus) { 371 case HEX: 372 status = BROKEN_ESCAPE; 373 break; 374 case SLASH_START: 375 if (usingSlash) { 376 status = BROKEN_ESCAPE; 377 } else { 378 buffer.append(BACK_SLASH); 379 } 380 break; 381 case START_QUOTE: case NORMAL_QUOTE: 382 status = BROKEN_QUOTE; 383 break; 384 } 385 return status; 386 } 387 388 389} 390//eof 391