1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/* 3 ******************************************************************************* 4 * Copyright (C) 2009-2015, Google, International Business Machines Corporation 5 * and others. All Rights Reserved. 6 ******************************************************************************* 7 */ 8package android.icu.impl; 9 10import java.io.BufferedReader; 11import java.io.FileInputStream; 12import java.io.IOException; 13import java.io.InputStream; 14import java.io.InputStreamReader; 15import java.io.UnsupportedEncodingException; 16import java.text.ParsePosition; 17import java.util.Arrays; 18import java.util.Comparator; 19import java.util.LinkedHashSet; 20import java.util.List; 21import java.util.Map; 22import java.util.Map.Entry; 23import java.util.Set; 24import java.util.TreeMap; 25import java.util.regex.Pattern; 26 27import android.icu.text.StringTransform; 28import android.icu.text.SymbolTable; 29import android.icu.text.UnicodeSet; 30import android.icu.util.Freezable; 31 32/** 33 * Contains utilities to supplement the JDK Regex, since it doesn't handle 34 * Unicode well. 35 * 36 * <p>TODO: Move to android.icu.dev.somewhere. 37 * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools. 38 * 39 * @author markdavis 40 * @hide Only a subset of ICU is exposed in Android 41 */ 42public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform { 43 // Note: we don't currently have any state, but intend to in the future, 44 // particularly for the regex style supported. 45 46 private SymbolTable symbolTable; 47 48 /** 49 * Set the symbol table for internal processing 50 * @hide draft / provisional / internal are hidden on Android 51 */ 52 public SymbolTable getSymbolTable() { 53 return symbolTable; 54 } 55 56 /** 57 * Get the symbol table for internal processing 58 * @hide draft / provisional / internal are hidden on Android 59 */ 60 public UnicodeRegex setSymbolTable(SymbolTable symbolTable) { 61 this.symbolTable = symbolTable; 62 return this; 63 } 64 65 /** 66 * Adds full Unicode property support, with the latest version of Unicode, 67 * to Java Regex, bringing it up to Level 1 (see 68 * http://www.unicode.org/reports/tr18/). It does this by preprocessing the 69 * regex pattern string and interpreting the character classes (\p{...}, 70 * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With 71 * this utility, Java regex expressions can be updated to work with the 72 * latest version of Unicode, and with all Unicode properties. Note that the 73 * UnicodeSet syntax has not yet, however, been updated to be completely 74 * consistent with Java regex, so be careful of the differences. 75 * <p>Not thread-safe; create a separate copy for different threads. 76 * <p>In the future, we may extend this to support other regex packages. 77 * 78 * @regex A modified Java regex pattern, as in the input to 79 * Pattern.compile(), except that all "character classes" are 80 * processed as if they were UnicodeSet patterns. Example: 81 * "abc[:bc=N:]. See UnicodeSet for the differences in syntax. 82 * @return A processed Java regex pattern, suitable for input to 83 * Pattern.compile(). 84 */ 85 public String transform(String regex) { 86 StringBuilder result = new StringBuilder(); 87 UnicodeSet temp = new UnicodeSet(); 88 ParsePosition pos = new ParsePosition(0); 89 int state = 0; // 1 = after \ 90 91 // We add each character unmodified to the output, unless we have a 92 // UnicodeSet. Note that we don't worry about supplementary characters, 93 // since none of the syntax uses them. 94 95 for (int i = 0; i < regex.length(); ++i) { 96 // look for UnicodeSets, allowing for quoting with \ and \Q 97 char ch = regex.charAt(i); 98 switch (state) { 99 case 0: // we only care about \, and '['. 100 if (ch == '\\') { 101 if (UnicodeSet.resemblesPattern(regex, i)) { 102 // should only happen with \p 103 i = processSet(regex, i, result, temp, pos); 104 continue; 105 } 106 state = 1; 107 } else if (ch == '[') { 108 // if we have what looks like a UnicodeSet 109 if (UnicodeSet.resemblesPattern(regex, i)) { 110 i = processSet(regex, i, result, temp, pos); 111 continue; 112 } 113 } 114 break; 115 116 case 1: // we are after a \ 117 if (ch == 'Q') { 118 state = 1; 119 } else { 120 state = 0; 121 } 122 break; 123 124 case 2: // we are in a \Q... 125 if (ch == '\\') { 126 state = 3; 127 } 128 break; 129 130 case 3: // we are in at \Q...\ 131 if (ch == 'E') { 132 state = 0; 133 } 134 state = 2; 135 break; 136 } 137 result.append(ch); 138 } 139 return result.toString(); 140 } 141 142 /** 143 * Convenience static function, using standard parameters. 144 * @param regex as in process() 145 * @return processed regex pattern, as in process() 146 */ 147 public static String fix(String regex) { 148 return STANDARD.transform(regex); 149 } 150 151 /** 152 * Compile a regex string, after processing by fix(...). 153 * 154 * @param regex Raw regex pattern, as in fix(...). 155 * @return Pattern 156 */ 157 public static Pattern compile(String regex) { 158 return Pattern.compile(STANDARD.transform(regex)); 159 } 160 161 /** 162 * Compile a regex string, after processing by fix(...). 163 * 164 * @param regex Raw regex pattern, as in fix(...). 165 * @return Pattern 166 */ 167 public static Pattern compile(String regex, int options) { 168 return Pattern.compile(STANDARD.transform(regex), options); 169 } 170 171 /** 172 * Compile a composed string from a set of BNF lines; see the List version for more information. 173 * 174 * @param bnfLines Series of BNF lines. 175 * @return Pattern 176 */ 177 public String compileBnf(String bnfLines) { 178 return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n"))); 179 } 180 181 /** 182 * Compile a composed string from a set of BNF lines, such as for composing a regex 183 * expression. The lines can be in any order, but there must not be any 184 * cycles. The result can be used as input for fix(). 185 * <p> 186 * Example: 187 * <pre> 188 * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?; 189 * scheme = reserved+; 190 * host = // reserved+; 191 * query = [\\=reserved]+; 192 * fragment = reserved+; 193 * reserved = [[:ascii:][:alphabetic:]]; 194 * </pre> 195 * <p> 196 * Caveats: at this point the parsing is simple; for example, # cannot be 197 * quoted (use \\u0023); you can set it to null to disable. 198 * The equality sign and a few others can be reset with 199 * setBnfX(). 200 * 201 * @param lines Series of lines that represent a BNF expression. The lines contain 202 * a series of statements that of the form x=y;. A statement can take 203 * multiple lines, but there can't be multiple statements on a line. 204 * A hash quotes to the end of the line. 205 * @return Pattern 206 */ 207 public String compileBnf(List<String> lines) { 208 Map<String, String> variables = getVariables(lines); 209 Set<String> unused = new LinkedHashSet<String>(variables.keySet()); 210 // brute force replacement; do twice to allow for different order 211 // later on can optimize 212 for (int i = 0; i < 2; ++i) { 213 for (Entry<String, String> entry : variables.entrySet()) { 214 String variable = entry.getKey(), 215 definition = entry.getValue(); 216 217 for (Entry<String, String> entry2 : variables.entrySet()) { 218 String variable2 = entry2.getKey(), 219 definition2 = entry2.getValue(); 220 if (variable.equals(variable2)) { 221 continue; 222 } 223 String altered2 = definition2.replace(variable, definition); 224 if (!altered2.equals(definition2)) { 225 unused.remove(variable); 226 variables.put(variable2, altered2); 227 if (log != null) { 228 try { 229 log.append(variable2 + "=" + altered2 + ";"); 230 } catch (IOException e) { 231 throw (IllegalArgumentException) new IllegalArgumentException().initCause(e); 232 } 233 } 234 } 235 } 236 } 237 } 238 if (unused.size() != 1) { 239 throw new IllegalArgumentException("Not a single root: " + unused); 240 } 241 return variables.get(unused.iterator().next()); 242 } 243 244 public String getBnfCommentString() { 245 return bnfCommentString; 246 } 247 248 public void setBnfCommentString(String bnfCommentString) { 249 this.bnfCommentString = bnfCommentString; 250 } 251 252 public String getBnfVariableInfix() { 253 return bnfVariableInfix; 254 } 255 256 public void setBnfVariableInfix(String bnfVariableInfix) { 257 this.bnfVariableInfix = bnfVariableInfix; 258 } 259 260 public String getBnfLineSeparator() { 261 return bnfLineSeparator; 262 } 263 264 public void setBnfLineSeparator(String bnfLineSeparator) { 265 this.bnfLineSeparator = bnfLineSeparator; 266 } 267 268 /** 269 * Utility for loading lines from a file. 270 * @param result The result of the appended lines. 271 * @param file The file to have an input stream. 272 * @param encoding if null, then UTF-8 273 * @return filled list 274 * @throws IOException If there were problems opening the file for input stream. 275 */ 276 public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException { 277 InputStream is = new FileInputStream(file); 278 try { 279 return appendLines(result, is, encoding); 280 } finally { 281 is.close(); 282 } 283 } 284 285 /** 286 * Utility for loading lines from a UTF8 file. 287 * @param result The result of the appended lines. 288 * @param inputStream The input stream. 289 * @param encoding if null, then UTF-8 290 * @return filled list 291 * @throws IOException If there were problems opening the input stream for reading. 292 */ 293 public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding) 294 throws UnsupportedEncodingException, IOException { 295 BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding)); 296 while (true) { 297 String line = in.readLine(); 298 if (line == null) break; 299 result.add(line); 300 } 301 return result; 302 } 303 304 305 306 /* (non-Javadoc) 307 * @see android.icu.util.Freezable#cloneAsThawed() 308 */ 309 public UnicodeRegex cloneAsThawed() { 310 // TODO Auto-generated method stub 311 try { 312 return (UnicodeRegex)clone(); 313 } catch (CloneNotSupportedException e) { 314 throw new IllegalArgumentException(); // should never happen 315 } 316 } 317 318 /* (non-Javadoc) 319 * @see android.icu.util.Freezable#freeze() 320 */ 321 public UnicodeRegex freeze() { 322 // no action needed now. 323 return this; 324 } 325 326 /* (non-Javadoc) 327 * @see android.icu.util.Freezable#isFrozen() 328 */ 329 public boolean isFrozen() { 330 // at this point, always true 331 return true; 332 } 333 334 // ===== PRIVATES ===== 335 336 private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) { 337 try { 338 pos.setIndex(i); 339 UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0); 340 x.complement().complement(); // hack to fix toPattern 341 result.append(x.toPattern(false)); 342 i = pos.getIndex() - 1; // allow for the loop increment 343 return i; 344 } catch (Exception e) { 345 throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e); 346 } 347 } 348 349 private static UnicodeRegex STANDARD = new UnicodeRegex(); 350 private String bnfCommentString = "#"; 351 private String bnfVariableInfix = "="; 352 private String bnfLineSeparator = "\n"; 353 private Appendable log = null; 354 355 private Comparator<Object> LongestFirst = new Comparator<Object>() { 356 public int compare(Object obj0, Object obj1) { 357 String arg0 = obj0.toString(); 358 String arg1 = obj1.toString(); 359 int len0 = arg0.length(); 360 int len1 = arg1.length(); 361 if (len0 != len1) return len1 - len0; 362 return arg0.compareTo(arg1); 363 } 364 }; 365 366 private Map<String, String> getVariables(List<String> lines) { 367 Map<String, String> variables = new TreeMap<String, String>(LongestFirst); 368 String variable = null; 369 StringBuffer definition = new StringBuffer(); 370 int count = 0; 371 for (String line : lines) { 372 ++count; 373 // remove initial bom, comments 374 if (line.length() == 0) continue; 375 if (line.charAt(0) == '\uFEFF') line = line.substring(1); 376 377 if (bnfCommentString != null) { 378 int hashPos = line.indexOf(bnfCommentString); 379 if (hashPos >= 0) line = line.substring(0, hashPos); 380 } 381 String trimline = line.trim(); 382 if (trimline.length() == 0) continue; 383 384 // String[] lineParts = line.split(";"); 385 String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " "); 386 if (linePart.trim().length() == 0) continue; 387 boolean terminated = trimline.endsWith(";"); 388 if (terminated) { 389 linePart = linePart.substring(0,linePart.lastIndexOf(';')); 390 } 391 int equalsPos = linePart.indexOf(bnfVariableInfix); 392 if (equalsPos >= 0) { 393 if (variable != null) { 394 throw new IllegalArgumentException("Missing ';' before " + count + ") " + line); 395 } 396 variable = linePart.substring(0,equalsPos).trim(); 397 if (variables.containsKey(variable)) { 398 throw new IllegalArgumentException("Duplicate variable definition in " + line); 399 } 400 definition.append(linePart.substring(equalsPos+1).trim()); 401 } else { // no equals, so 402 if (variable == null) { 403 throw new IllegalArgumentException("Missing '=' at " + count + ") " + line); 404 } 405 definition.append(bnfLineSeparator).append(linePart); 406 } 407 // we are terminated if i is not at the end, or the line ends with a ; 408 if (terminated) { 409 variables.put(variable, definition.toString()); 410 variable = null; // signal we have no variable 411 definition.setLength(0); 412 } 413 } 414 if (variable != null) { 415 throw new IllegalArgumentException("Missing ';' at end"); 416 } 417 return variables; 418 } 419} 420