1/* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package java.util.regex; 18 19import java.io.Serializable; 20import java.util.ArrayList; 21import com.ibm.icu4jni.regex.NativeRegEx; 22 23/** 24 * Represents a pattern used for matching, searching, or replacing strings. 25 * {@code Pattern}s are specified in terms of regular expressions and compiled 26 * using an instance of this class. They are then used in conjunction with a 27 * {@link Matcher} to perform the actual search. 28 * <p/> 29 * A typical use case looks like this: 30 * <p/> 31 * <pre> 32 * Pattern p = Pattern.compile("Hello, A[a-z]*!"); 33 * 34 * Matcher m = p.matcher("Hello, Android!"); 35 * boolean b1 = m.matches(); // true 36 * 37 * m.setInput("Hello, Robot!"); 38 * boolean b2 = m.matches(); // false 39 * </pre> 40 * <p/> 41 * The above code could also be written in a more compact fashion, though this 42 * variant is less efficient, since {@code Pattern} and {@code Matcher} objects 43 * are created on the fly instead of being reused. 44 * fashion: 45 * <pre> 46 * boolean b1 = Pattern.matches("Hello, A[a-z]*!", "Hello, Android!"); // true 47 * boolean b2 = Pattern.matches("Hello, A[a-z]*!", "Hello, Robot!"); // false 48 * </pre> 49 * <p/> 50 * Please consult the <a href="package-descr.html">package documentation</a> for an 51 * overview of the regular expression syntax used in this class as well as 52 * Android-specific implementation details. 53 * 54 * @see Matcher 55 * @since Android 1.0 56 */ 57public final class Pattern implements Serializable { 58 59 private static final long serialVersionUID = 5073258162644648461L; 60 61 /** 62 * This constant specifies that a pattern matches Unix line endings ('\n') 63 * only against the '.', '^', and '$' meta characters. 64 */ 65 public static final int UNIX_LINES = 0x01; 66 67 /** 68 * This constant specifies that a {@code Pattern} is matched 69 * case-insensitively. That is, the patterns "a+" and "A+" would both match 70 * the string "aAaAaA". 71 * <p> 72 * Note: For Android, the {@code CASE_INSENSITIVE} constant 73 * (currently) always includes the meaning of the {@link #UNICODE_CASE} 74 * constant. So if case insensitivity is enabled, this automatically extends 75 * to all Unicode characters. The {@code UNICODE_CASE} constant itself has 76 * no special consequences. 77 */ 78 public static final int CASE_INSENSITIVE = 0x02; 79 80 /** 81 * This constant specifies that a {@code Pattern} may contain whitespace or 82 * comments. Otherwise comments and whitespace are taken as literal 83 * characters. 84 */ 85 public static final int COMMENTS = 0x04; 86 87 /** 88 * This constant specifies that the meta characters '^' and '$' match only 89 * the beginning and end end of an input line, respectively. Normally, they 90 * match the beginning and the end of the complete input. 91 */ 92 public static final int MULTILINE = 0x08; 93 94 /** 95 * This constant specifies that the whole {@code Pattern} is to be taken 96 * literally, that is, all meta characters lose their meanings. 97 */ 98 public static final int LITERAL = 0x10; 99 100 /** 101 * This constant specifies that the '.' meta character matches arbitrary 102 * characters, including line endings, which is normally not the case. 103 */ 104 public static final int DOTALL = 0x20; 105 106 /** 107 * This constant specifies that a {@code Pattern} is matched 108 * case-insensitively with regard to all Unicode characters. It is used in 109 * conjunction with the {@link #CASE_INSENSITIVE} constant to extend its 110 * meaning to all Unicode characters. 111 * <p> 112 * Note: For Android, the {@code CASE_INSENSITIVE} constant 113 * (currently) always includes the meaning of the {@code UNICODE_CASE} 114 * constant. So if case insensitivity is enabled, this automatically extends 115 * to all Unicode characters. The {@code UNICODE_CASE} constant then has no 116 * special consequences. 117 */ 118 public static final int UNICODE_CASE = 0x40; 119 120 /** 121 * This constant specifies that a character in a {@code Pattern} and a 122 * character in the input string only match if they are canonically 123 * equivalent. It is (currently) not supported in Android. 124 */ 125 public static final int CANON_EQ = 0x80; 126 127 /** 128 * Holds the regular expression. 129 */ 130 private String pattern; 131 132 /** 133 * Holds the flags used when compiling this pattern. 134 */ 135 private int flags; 136 137 /** 138 * Holds a handle (a pointer, actually) for the native ICU pattern. 139 */ 140 transient int mNativePattern; 141 142 /** 143 * Holds the number of groups in the pattern. 144 */ 145 transient int mGroupCount; 146 147 148 /** 149 * Returns a {@link Matcher} for the {@code Pattern} and a given input. The 150 * {@code Matcher} can be used to match the {@code Pattern} against the 151 * whole input, find occurrences of the {@code Pattern} in the input, or 152 * replace parts of the input. 153 * 154 * @param input 155 * the input to process. 156 * 157 * @return the resulting {@code Matcher}. 158 */ 159 public Matcher matcher(CharSequence input) { 160 return new Matcher(this, input); 161 } 162 163 /** 164 * Splits the given input sequence at occurrences of this {@code Pattern}. 165 * 166 * <p>If this {@code Pattern} does not occur in the input, the result is an 167 * array containing the input (converted from a {@code CharSequence} to 168 * a {@code String}). 169 * 170 * <p>Otherwise, the {@code limit} parameter controls the contents of the 171 * returned array as described below. 172 * 173 * @param inputSeq 174 * the input sequence. 175 * @param limit 176 * Determines the maximum number of entries in the resulting 177 * array, and the treatment of trailing empty strings. 178 * <ul> 179 * <li>For n > 0, the resulting array contains at most n 180 * entries. If this is fewer than the number of matches, the 181 * final entry will contain all remaining input. 182 * <li>For n < 0, the length of the resulting array is 183 * exactly the number of occurrences of the {@code Pattern} 184 * plus one for the text after the final separator. 185 * All entries are included. 186 * <li>For n == 0, the result is as for n < 0, except 187 * trailing empty strings will not be returned. (Note that 188 * the case where the input is itself an empty string is 189 * special, as described above, and the limit parameter does 190 * not apply there.) 191 * </ul> 192 * 193 * @return the resulting array. 194 */ 195 public String[] split(CharSequence inputSeq, int limit) { 196 if (inputSeq.length() == 0) { 197 // Unlike Perl, which considers the result of splitting the empty 198 // string to be the empty array, Java returns an array containing 199 // the empty string. 200 return new String[] { "" }; 201 } 202 203 int maxLength = limit <= 0 ? Integer.MAX_VALUE : limit; 204 205 String input = inputSeq.toString(); 206 ArrayList<String> list = new ArrayList<String>(); 207 208 Matcher matcher = new Matcher(this, inputSeq); 209 int savedPos = 0; 210 211 // Add text preceding each occurrence, if enough space. 212 while(matcher.find() && list.size() + 1 < maxLength) { 213 list.add(input.substring(savedPos, matcher.start())); 214 savedPos = matcher.end(); 215 } 216 217 // Add trailing text if enough space. 218 if (list.size() < maxLength) { 219 if (savedPos < input.length()) { 220 list.add(input.substring(savedPos)); 221 } else { 222 list.add(""); 223 } 224 } 225 226 // Remove trailing empty matches in the limit == 0 case. 227 if (limit == 0) { 228 int i = list.size() - 1; 229 while (i >= 0 && "".equals(list.get(i))) { 230 list.remove(i); 231 i--; 232 } 233 } 234 235 return list.toArray(new String[list.size()]); 236 } 237 238 /** 239 * Splits a given input around occurrences of a regular expression. This is 240 * a convenience method that is equivalent to calling the method 241 * {@link #split(java.lang.CharSequence, int)} with a limit of 0. 242 * 243 * @param input 244 * the input sequence. 245 * 246 * @return the resulting array. 247 */ 248 public String[] split(CharSequence input) { 249 return split(input, 0); 250 } 251 252 /** 253 * Returns the regular expression that was compiled into this 254 * {@code Pattern}. 255 * 256 * @return the regular expression. 257 */ 258 public String pattern() { 259 return pattern; 260 } 261 262 @Override 263 public String toString() { 264 return pattern; 265 } 266 267 /** 268 * Returns the flags that have been set for this {@code Pattern}. 269 * 270 * @return the flags that have been set. A combination of the constants 271 * defined in this class. 272 * 273 * @see #CANON_EQ 274 * @see #CASE_INSENSITIVE 275 * @see #COMMENTS 276 * @see #DOTALL 277 * @see #LITERAL 278 * @see #MULTILINE 279 * @see #UNICODE_CASE 280 * @see #UNIX_LINES 281 */ 282 public int flags() { 283 return flags; 284 } 285 286 /** 287 * Compiles a regular expression, creating a new {@code Pattern} instance in 288 * the process. Allows to set some flags that modify the behavior of the 289 * {@code Pattern}. 290 * 291 * @param pattern 292 * the regular expression. 293 * @param flags 294 * the flags to set. Basically, any combination of the constants 295 * defined in this class is valid. 296 * <p> 297 * Note: Currently, the {@link #CASE_INSENSITIVE} and 298 * {@link #UNICODE_CASE} constants have slightly special behavior 299 * in Android, and the {@link #CANON_EQ} constant is not 300 * supported at all. 301 * 302 * @return the new {@code Pattern} instance. 303 * 304 * @throws PatternSyntaxException 305 * if the regular expression is syntactically incorrect. 306 * 307 * @see #CANON_EQ 308 * @see #CASE_INSENSITIVE 309 * @see #COMMENTS 310 * @see #DOTALL 311 * @see #LITERAL 312 * @see #MULTILINE 313 * @see #UNICODE_CASE 314 * @see #UNIX_LINES 315 */ 316 public static Pattern compile(String pattern, int flags) throws PatternSyntaxException { 317 return new Pattern(pattern, flags); 318 } 319 320 /** 321 * Creates a new {@code Pattern} instance from a given regular expression 322 * and flags. 323 * 324 * @param pattern 325 * the regular expression. 326 * @param flags 327 * the flags to set. Any combination of the constants defined in 328 * this class is valid. 329 * 330 * @throws PatternSyntaxException 331 * if the regular expression is syntactically incorrect. 332 */ 333 private Pattern(String pattern, int flags) throws PatternSyntaxException { 334 if ((flags & CANON_EQ) != 0) { 335 throw new UnsupportedOperationException("CANON_EQ flag not supported"); 336 } 337 338 this.pattern = pattern; 339 this.flags = flags; 340 341 compileImpl(pattern, flags); 342 } 343 344 /** 345 * Compiles a regular expression, creating a new Pattern instance in the 346 * process. This is actually a convenience method that calls {@link 347 * #compile(String, int)} with a {@code flags} value of zero. 348 * 349 * @param pattern 350 * the regular expression. 351 * 352 * @return the new {@code Pattern} instance. 353 * 354 * @throws PatternSyntaxException 355 * if the regular expression is syntactically incorrect. 356 */ 357 public static Pattern compile(String pattern) { 358 return new Pattern(pattern, 0); 359 } 360 361 /** 362 * Compiles the given regular expression using the given flags. Used 363 * internally only. 364 * 365 * @param pattern 366 * the regular expression. 367 * @param flags 368 * the flags. 369 */ 370 private void compileImpl(String pattern, int flags) throws PatternSyntaxException { 371 if (pattern == null) { 372 throw new NullPointerException(); 373 } 374 375 if ((flags & LITERAL) != 0) { 376 pattern = quote(pattern); 377 } 378 379 // These are the flags natively supported by ICU. 380 // They even have the same value in native code. 381 flags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES); 382 383 mNativePattern = NativeRegEx.open(pattern, flags); 384 mGroupCount = NativeRegEx.groupCount(mNativePattern); 385 } 386 387 /** 388 * Tries to match a given regular expression against a given input. This is 389 * actually nothing but a convenience method that compiles the regular 390 * expression into a {@code Pattern}, builds a {@link Matcher} for it, and 391 * then does the match. If the same regular expression is used for multiple 392 * operations, it is recommended to compile it into a {@code Pattern} 393 * explicitly and request a reusable {@code Matcher}. 394 * 395 * @param regex 396 * the regular expression. 397 * @param input 398 * the input to process. 399 * 400 * @return true if and only if the {@code Pattern} matches the input. 401 * 402 * @see Pattern#compile(java.lang.String, int) 403 * @see Matcher#matches() 404 */ 405 public static boolean matches(String regex, CharSequence input) { 406 return new Matcher(new Pattern(regex, 0), input).matches(); 407 } 408 409 /** 410 * Quotes a given string using "\Q" and "\E", so that all other 411 * meta-characters lose their special meaning. If the string is used for a 412 * {@code Pattern} afterwards, it can only be matched literally. 413 * 414 * @param s 415 * the string to quote. 416 * 417 * @return the quoted string. 418 */ 419 public static String quote(String s) { 420 StringBuilder sb = new StringBuilder().append("\\Q"); //$NON-NLS-1$ 421 int apos = 0; 422 int k; 423 while ((k = s.indexOf("\\E", apos)) >= 0) { //$NON-NLS-1$ 424 sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q"); //$NON-NLS-1$ 425 apos = k + 2; 426 } 427 428 return sb.append(s.substring(apos)).append("\\E").toString(); //$NON-NLS-1$ 429 } 430 431 @Override 432 protected void finalize() throws Throwable { 433 try { 434 if (mNativePattern != 0) { 435 NativeRegEx.close(mNativePattern); 436 } 437 } 438 finally { 439 super.finalize(); 440 } 441 } 442 443 /** 444 * Serialization support 445 */ 446 private void readObject(java.io.ObjectInputStream s) 447 throws java.io.IOException, ClassNotFoundException { 448 s.defaultReadObject(); 449 450 compileImpl(pattern, flags); 451 } 452 453} 454