1/* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package java.util.regex; 18 19import java.io.IOException; 20import java.io.ObjectInputStream; 21import java.io.Serializable; 22 23/** 24 * Patterns are compiled regular expressions. In many cases, convenience methods such as 25 * {@link String#matches String.matches}, {@link String#replaceAll String.replaceAll} and 26 * {@link String#split String.split} will be preferable, but if you need to do a lot of work 27 * with the same regular expression, it may be more efficient to compile it once and reuse it. 28 * The {@code Pattern} class and its companion, {@link Matcher}, also offer more functionality 29 * than the small amount exposed by {@code String}. 30 * 31 * <pre> 32 * // String convenience methods: 33 * boolean sawFailures = s.matches("Failures: \\d+"); 34 * String farewell = s.replaceAll("Hello, (\\S+)", "Goodbye, $1"); 35 * String[] fields = s.split(":"); 36 * 37 * // Direct use of Pattern: 38 * Pattern p = Pattern.compile("Hello, (\\S+)"); 39 * Matcher m = p.matcher(inputString); 40 * while (m.find()) { // Find each match in turn; String can't do this. 41 * String name = m.group(1); // Access a submatch group; String can't do this. 42 * } 43 * </pre> 44 * 45 * <h3>Regular expression syntax</h3> 46 * <span class="datatable"> 47 * <style type="text/css"> 48 * .datatable td { padding-right: 20px; } 49 * </style> 50 * 51 * <p>Java supports a subset of Perl 5 regular expression syntax. An important gotcha is that Java 52 * has no regular expression literals, and uses plain old string literals instead. This means that 53 * you need an extra level of escaping. For example, the regular expression {@code \s+} has to 54 * be represented as the string {@code "\\s+"}. 55 * 56 * <h3>Escape sequences</h3> 57 * <p><table> 58 * <tr> <td> \ </td> <td>Quote the following metacharacter (so {@code \.} matches a literal {@code .}).</td> </tr> 59 * <tr> <td> \Q </td> <td>Quote all following metacharacters until {@code \E}.</td> </tr> 60 * <tr> <td> \E </td> <td>Stop quoting metacharacters (started by {@code \Q}).</td> </tr> 61 * <tr> <td> \\ </td> <td>A literal backslash.</td> </tr> 62 * <tr> <td> \u<i>hhhh</i> </td> <td>The Unicode character U+hhhh (in hex).</td> </tr> 63 * <tr> <td> \x<i>hh</i> </td> <td>The Unicode character U+00hh (in hex).</td> </tr> 64 * <tr> <td> \c<i>x</i> </td> <td>The ASCII control character ^x (so {@code \cH} would be ^H, U+0008).</td> </tr> 65 * 66 * <tr> <td> \a </td> <td>The ASCII bell character (U+0007).</td> </tr> 67 * <tr> <td> \e </td> <td>The ASCII ESC character (U+001b).</td> </tr> 68 * <tr> <td> \f </td> <td>The ASCII form feed character (U+000c).</td> </tr> 69 * <tr> <td> \n </td> <td>The ASCII newline character (U+000a).</td> </tr> 70 * <tr> <td> \r </td> <td>The ASCII carriage return character (U+000d).</td> </tr> 71 * <tr> <td> \t </td> <td>The ASCII tab character (U+0009).</td> </tr> 72 * </table> 73 * 74 * <h3>Character classes</h3> 75 * <p>It's possible to construct arbitrary character classes using set operations: 76 * <table> 77 * <tr> <td> [abc] </td> <td>Any one of {@code a}, {@code b}, or {@code c}. (Enumeration.)</td> </tr> 78 * <tr> <td> [a-c] </td> <td>Any one of {@code a}, {@code b}, or {@code c}. (Range.)</td> </tr> 79 * <tr> <td> [^abc] </td> <td>Any character <i>except</i> {@code a}, {@code b}, or {@code c}. (Negation.)</td> </tr> 80 * <tr> <td> [[a-f][0-9]] </td> <td>Any character in either range. (Union.)</td> </tr> 81 * <tr> <td> [[a-z]&&[jkl]] </td> <td>Any character in both ranges. (Intersection.)</td> </tr> 82 * </table> 83 * <p>Most of the time, the built-in character classes are more useful: 84 * <table> 85 * <tr> <td> \d </td> <td>Any digit character (see note below).</td> </tr> 86 * <tr> <td> \D </td> <td>Any non-digit character (see note below).</td> </tr> 87 * <tr> <td> \s </td> <td>Any whitespace character (see note below).</td> </tr> 88 * <tr> <td> \S </td> <td>Any non-whitespace character (see note below).</td> </tr> 89 * <tr> <td> \w </td> <td>Any word character (see note below).</td> </tr> 90 * <tr> <td> \W </td> <td>Any non-word character (see note below).</td> </tr> 91 * <tr> <td> \p{<i>NAME</i>} </td> <td> Any character in the class with the given <i>NAME</i>. </td> </tr> 92 * <tr> <td> \P{<i>NAME</i>} </td> <td> Any character <i>not</i> in the named class. </td> </tr> 93 * </table> 94 * <p>Note that these built-in classes don't just cover the traditional ASCII range. For example, 95 * <code>\w</code> is equivalent to the character class <code>[\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}]</code>. 96 * For more details see <a href="http://www.unicode.org/reports/tr18/#Compatibility_Properties">Unicode TR-18</a>, 97 * and bear in mind that the set of characters in each class can vary between Unicode releases. 98 * If you actually want to match only ASCII characters, specify the explicit characters you want; 99 * if you mean 0-9 use <code>[0-9]</code> rather than <code>\d</code>, which would also include 100 * Gurmukhi digits and so forth. 101 * <p>There are also a variety of named classes: 102 * <ul> 103 * <li><a href="../../lang/Character.html#unicode_categories">Unicode category names</a>, 104 * prefixed by {@code Is}. For example {@code \p{IsLu}} for all uppercase letters. 105 * <li>POSIX class names. These are 'Alnum', 'Alpha', 'ASCII', 'Blank', 'Cntrl', 'Digit', 106 * 'Graph', 'Lower', 'Print', 'Punct', 'Upper', 'XDigit'. 107 * <li>Unicode block names, as accepted as input to {@link java.lang.Character.UnicodeBlock#forName}, 108 * prefixed by {@code In}. For example {@code \p{InHebrew}} for all characters in the Hebrew block. 109 * <li>Character method names. These are all non-deprecated methods from {@link java.lang.Character} 110 * whose name starts with {@code is}, but with the {@code is} replaced by {@code java}. 111 * For example, {@code \p{javaLowerCase}}. 112 * </ul> 113 * 114 * <h3>Quantifiers</h3> 115 * <p>Quantifiers match some number of instances of the preceding regular expression. 116 * <table> 117 * <tr> <td> * </td> <td>Zero or more.</td> </tr> 118 * <tr> <td> ? </td> <td>Zero or one.</td> </tr> 119 * <tr> <td> + </td> <td>One or more.</td> </tr> 120 * <tr> <td> {<i>n</i>} </td> <td>Exactly <i>n</i>.</td> </tr> 121 * <tr> <td> {<i>n,</i>} </td> <td>At least <i>n</i>.</td> </tr> 122 * <tr> <td> {<i>n</i>,<i>m</i>} </td> <td>At least <i>n</i> but not more than <i>m</i>.</td> </tr> 123 * </table> 124 * <p>Quantifiers are "greedy" by default, meaning that they will match the longest possible input 125 * sequence. There are also non-greedy quantifiers that match the shortest possible input sequence. 126 * They're same as the greedy ones but with a trailing {@code ?}: 127 * <table> 128 * <tr> <td> *? </td> <td>Zero or more (non-greedy).</td> </tr> 129 * <tr> <td> ?? </td> <td>Zero or one (non-greedy).</td> </tr> 130 * <tr> <td> +? </td> <td>One or more (non-greedy).</td> </tr> 131 * <tr> <td> {<i>n</i>}? </td> <td>Exactly <i>n</i> (non-greedy).</td> </tr> 132 * <tr> <td> {<i>n,</i>}? </td> <td>At least <i>n</i> (non-greedy).</td> </tr> 133 * <tr> <td> {<i>n</i>,<i>m</i>}? </td> <td>At least <i>n</i> but not more than <i>m</i> (non-greedy).</td> </tr> 134 * </table> 135 * <p>Quantifiers allow backtracking by default. There are also possessive quantifiers to prevent 136 * backtracking. They're same as the greedy ones but with a trailing {@code +}: 137 * <table> 138 * <tr> <td> *+ </td> <td>Zero or more (possessive).</td> </tr> 139 * <tr> <td> ?+ </td> <td>Zero or one (possessive).</td> </tr> 140 * <tr> <td> ++ </td> <td>One or more (possessive).</td> </tr> 141 * <tr> <td> {<i>n</i>}+ </td> <td>Exactly <i>n</i> (possessive).</td> </tr> 142 * <tr> <td> {<i>n,</i>}+ </td> <td>At least <i>n</i> (possessive).</td> </tr> 143 * <tr> <td> {<i>n</i>,<i>m</i>}+ </td> <td>At least <i>n</i> but not more than <i>m</i> (possessive).</td> </tr> 144 * </table> 145 * 146 * <h3>Zero-width assertions</h3> 147 * <p><table> 148 * <tr> <td> ^ </td> <td>At beginning of line.</td> </tr> 149 * <tr> <td> $ </td> <td>At end of line.</td> </tr> 150 * <tr> <td> \A </td> <td>At beginning of input.</td> </tr> 151 * <tr> <td> \b </td> <td>At word boundary.</td> </tr> 152 * <tr> <td> \B </td> <td>At non-word boundary.</td> </tr> 153 * <tr> <td> \G </td> <td>At end of previous match.</td> </tr> 154 * <tr> <td> \z </td> <td>At end of input.</td> </tr> 155 * <tr> <td> \Z </td> <td>At end of input, or before newline at end.</td> </tr> 156 * </table> 157 * 158 * <h3>Look-around assertions</h3> 159 * <p>Look-around assertions assert that the subpattern does (positive) or doesn't (negative) match 160 * after (look-ahead) or before (look-behind) the current position, without including the matched 161 * text in the containing match. The maximum length of possible matches for look-behind patterns 162 * must not be unbounded. 163 * <p><table> 164 * <tr> <td> (?=<i>a</i>) </td> <td>Zero-width positive look-ahead.</td> </tr> 165 * <tr> <td> (?!<i>a</i>) </td> <td>Zero-width negative look-ahead.</td> </tr> 166 * <tr> <td> (?<=<i>a</i>) </td> <td>Zero-width positive look-behind.</td> </tr> 167 * <tr> <td> (?<!<i>a</i>) </td> <td>Zero-width negative look-behind.</td> </tr> 168 * </table> 169 * 170 * <h3>Groups</h3> 171 * 172 * <p><table> 173 * <tr> <td> (<i>a</i>) </td> <td>A capturing group.</td> </tr> 174 * <tr> <td> (?:<i>a</i>) </td> <td>A non-capturing group.</td> </tr> 175 * <tr> <td> (?><i>a</i>) </td> <td>An independent non-capturing group. (The first match of the subgroup is the only match tried.)</td> </tr> 176 * <tr> <td> \<i>n</i> </td> <td>The text already matched by capturing group <i>n</i>.</td> </tr> 177 * </table> 178 * <p>See {@link Matcher#group} for details of how capturing groups are numbered and accessed. 179 * 180 * <h3>Operators</h3> 181 * <p><table> 182 * <tr> <td> <i>ab</i> </td> <td>Expression <i>a</i> followed by expression <i>b</i>.</td> </tr> 183 * <tr> <td> <i>a</i>|<i>b</i> </td> <td>Either expression <i>a</i> or expression <i>b</i>.</td> </tr> 184 * </table> 185 * 186 * <a name="flags"><h3>Flags</h3></a> 187 * <p><table> 188 * <tr> <td> (?dimsux-dimsux:<i>a</i>) </td> <td>Evaluates the expression <i>a</i> with the given flags enabled/disabled.</td> </tr> 189 * <tr> <td> (?dimsux-dimsux) </td> <td>Evaluates the rest of the pattern with the given flags enabled/disabled.</td> </tr> 190 * </table> 191 * 192 * <p>The flags are: 193 * <table> 194 * <tr><td>{@code i}</td> <td>{@link #CASE_INSENSITIVE}</td> <td>case insensitive matching</td></tr> 195 * <tr><td>{@code d}</td> <td>{@link #UNIX_LINES}</td> <td>only accept {@code '\n'} as a line terminator</td></tr> 196 * <tr><td>{@code m}</td> <td>{@link #MULTILINE}</td> <td>allow {@code ^} and {@code $} to match beginning/end of any line</td></tr> 197 * <tr><td>{@code s}</td> <td>{@link #DOTALL}</td> <td>allow {@code .} to match {@code '\n'} ("s" for "single line")</td></tr> 198 * <tr><td>{@code u}</td> <td>{@link #UNICODE_CASE}</td> <td>enable Unicode case folding</td></tr> 199 * <tr><td>{@code x}</td> <td>{@link #COMMENTS}</td> <td>allow whitespace and comments</td></tr> 200 * </table> 201 * <p>Either set of flags may be empty. For example, {@code (?i-m)} would turn on case-insensitivity 202 * and turn off multiline mode, {@code (?i)} would just turn on case-insensitivity, 203 * and {@code (?-m)} would just turn off multiline mode. 204 * <p>Note that on Android, {@code UNICODE_CASE} is always on: case-insensitive matching will 205 * always be Unicode-aware. 206 * <p>There are two other flags not settable via this mechanism: {@link #CANON_EQ} and 207 * {@link #LITERAL}. Attempts to use {@link #CANON_EQ} on Android will throw an exception. 208 * </span> 209 * 210 * <h3>Implementation notes</h3> 211 * 212 * <p>The regular expression implementation used in Android is provided by 213 * <a href="http://www.icu-project.org">ICU</a>. The notation for the regular 214 * expressions is mostly a superset of those used in other Java language 215 * implementations. This means that existing applications will normally work as 216 * expected, but in rare cases Android may accept a regular expression that is 217 * not accepted by other implementations. 218 * 219 * <p>In some cases, Android will recognize that a regular expression is a simple 220 * special case that can be handled more efficiently. This is true of both the convenience methods 221 * in {@code String} and the methods in {@code Pattern}. 222 * 223 * @see Matcher 224 */ 225public final class Pattern implements Serializable { 226 227 private static final long serialVersionUID = 5073258162644648461L; 228 229 /** 230 * This constant specifies that a pattern matches Unix line endings ('\n') 231 * only against the '.', '^', and '$' meta characters. Corresponds to {@code (?d)}. 232 */ 233 public static final int UNIX_LINES = 0x01; 234 235 /** 236 * This constant specifies that a {@code Pattern} is matched 237 * case-insensitively. That is, the patterns "a+" and "A+" would both match 238 * the string "aAaAaA". See {@link #UNICODE_CASE}. Corresponds to {@code (?i)}. 239 */ 240 public static final int CASE_INSENSITIVE = 0x02; 241 242 /** 243 * This constant specifies that a {@code Pattern} may contain whitespace or 244 * comments. Otherwise comments and whitespace are taken as literal 245 * characters. Corresponds to {@code (?x)}. 246 */ 247 public static final int COMMENTS = 0x04; 248 249 /** 250 * This constant specifies that the meta characters '^' and '$' match only 251 * the beginning and end of an input line, respectively. Normally, they 252 * match the beginning and the end of the complete input. Corresponds to {@code (?m)}. 253 */ 254 public static final int MULTILINE = 0x08; 255 256 /** 257 * This constant specifies that the whole {@code Pattern} is to be taken 258 * literally, that is, all meta characters lose their meanings. 259 */ 260 public static final int LITERAL = 0x10; 261 262 /** 263 * This constant specifies that the '.' meta character matches arbitrary 264 * characters, including line endings, which is normally not the case. 265 * Corresponds to {@code (?s)}. 266 */ 267 public static final int DOTALL = 0x20; 268 269 /** 270 * This constant specifies that a {@code Pattern} that uses case-insensitive matching 271 * will use Unicode case folding. On Android, {@code UNICODE_CASE} is always on: 272 * case-insensitive matching will always be Unicode-aware. If your code is intended to 273 * be portable and uses case-insensitive matching on non-ASCII characters, you should 274 * use this flag. Corresponds to {@code (?u)}. 275 */ 276 public static final int UNICODE_CASE = 0x40; 277 278 /** 279 * This constant specifies that a character in a {@code Pattern} and a 280 * character in the input string only match if they are canonically 281 * equivalent. It is (currently) not supported in Android. 282 */ 283 public static final int CANON_EQ = 0x80; 284 285 private final String pattern; 286 private final int flags; 287 288 transient long address; 289 290 /** 291 * Returns a {@link Matcher} for this pattern applied to the given {@code input}. 292 * The {@code Matcher} can be used to match the {@code Pattern} against the 293 * whole input, find occurrences of the {@code Pattern} in the input, or 294 * replace parts of the input. 295 */ 296 public Matcher matcher(CharSequence input) { 297 return new Matcher(this, input); 298 } 299 300 /** 301 * Splits the given {@code input} at occurrences of this pattern. 302 * 303 * <p>If this pattern does not occur in the input, the result is an 304 * array containing the input (converted from a {@code CharSequence} to 305 * a {@code String}). 306 * 307 * <p>Otherwise, the {@code limit} parameter controls the contents of the 308 * returned array as described below. 309 * 310 * @param limit 311 * Determines the maximum number of entries in the resulting 312 * array, and the treatment of trailing empty strings. 313 * <ul> 314 * <li>For n > 0, the resulting array contains at most n 315 * entries. If this is fewer than the number of matches, the 316 * final entry will contain all remaining input. 317 * <li>For n < 0, the length of the resulting array is 318 * exactly the number of occurrences of the {@code Pattern} 319 * plus one for the text after the final separator. 320 * All entries are included. 321 * <li>For n == 0, the result is as for n < 0, except 322 * trailing empty strings will not be returned. (Note that 323 * the case where the input is itself an empty string is 324 * special, as described above, and the limit parameter does 325 * not apply there.) 326 * </ul> 327 */ 328 public String[] split(CharSequence input, int limit) { 329 return Splitter.split(this, pattern, input.toString(), limit); 330 } 331 332 /** 333 * Equivalent to {@code split(input, 0)}. 334 */ 335 public String[] split(CharSequence input) { 336 return split(input, 0); 337 } 338 339 /** 340 * Returns the regular expression supplied to {@code compile}. 341 */ 342 public String pattern() { 343 return pattern; 344 } 345 346 @Override 347 public String toString() { 348 return pattern; 349 } 350 351 /** 352 * Returns the flags supplied to {@code compile}. 353 */ 354 public int flags() { 355 return flags; 356 } 357 358 /** 359 * Returns a compiled form of the given {@code regularExpression}, as modified by the 360 * given {@code flags}. See the <a href="#flags">flags overview</a> for more on flags. 361 * 362 * @throws PatternSyntaxException if the regular expression is syntactically incorrect. 363 * 364 * @see #CANON_EQ 365 * @see #CASE_INSENSITIVE 366 * @see #COMMENTS 367 * @see #DOTALL 368 * @see #LITERAL 369 * @see #MULTILINE 370 * @see #UNICODE_CASE 371 * @see #UNIX_LINES 372 */ 373 public static Pattern compile(String regularExpression, int flags) throws PatternSyntaxException { 374 return new Pattern(regularExpression, flags); 375 } 376 377 /** 378 * Equivalent to {@code Pattern.compile(pattern, 0)}. 379 */ 380 public static Pattern compile(String pattern) { 381 return new Pattern(pattern, 0); 382 } 383 384 private Pattern(String pattern, int flags) throws PatternSyntaxException { 385 if ((flags & CANON_EQ) != 0) { 386 throw new UnsupportedOperationException("CANON_EQ flag not supported"); 387 } 388 int supportedFlags = CASE_INSENSITIVE | COMMENTS | DOTALL | LITERAL | MULTILINE | UNICODE_CASE | UNIX_LINES; 389 if ((flags & ~supportedFlags) != 0) { 390 throw new IllegalArgumentException("Unsupported flags: " + (flags & ~supportedFlags)); 391 } 392 this.pattern = pattern; 393 this.flags = flags; 394 compile(); 395 } 396 397 private void compile() throws PatternSyntaxException { 398 if (pattern == null) { 399 throw new NullPointerException("pattern == null"); 400 } 401 402 String icuPattern = pattern; 403 if ((flags & LITERAL) != 0) { 404 icuPattern = quote(pattern); 405 } 406 407 // These are the flags natively supported by ICU. 408 // They even have the same value in native code. 409 int icuFlags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES); 410 411 address = compileImpl(icuPattern, icuFlags); 412 } 413 414 /** 415 * Tests whether the given {@code regularExpression} matches the given {@code input}. 416 * Equivalent to {@code Pattern.compile(regularExpression).matcher(input).matches()}. 417 * If the same regular expression is to be used for multiple operations, it may be more 418 * efficient to reuse a compiled {@code Pattern}. 419 * 420 * @see Pattern#compile(java.lang.String, int) 421 * @see Matcher#matches() 422 */ 423 public static boolean matches(String regularExpression, CharSequence input) { 424 return new Matcher(new Pattern(regularExpression, 0), input).matches(); 425 } 426 427 /** 428 * Quotes the given {@code string} using "\Q" and "\E", so that all 429 * meta-characters lose their special meaning. This method correctly 430 * escapes embedded instances of "\Q" or "\E". If the entire result 431 * is to be passed verbatim to {@link #compile}, it's usually clearer 432 * to use the {@link #LITERAL} flag instead. 433 */ 434 public static String quote(String string) { 435 StringBuilder sb = new StringBuilder(); 436 sb.append("\\Q"); 437 int apos = 0; 438 int k; 439 while ((k = string.indexOf("\\E", apos)) >= 0) { 440 sb.append(string.substring(apos, k + 2)).append("\\\\E\\Q"); 441 apos = k + 2; 442 } 443 return sb.append(string.substring(apos)).append("\\E").toString(); 444 } 445 446 @Override protected void finalize() throws Throwable { 447 try { 448 closeImpl(address); 449 } finally { 450 super.finalize(); 451 } 452 } 453 454 private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException { 455 s.defaultReadObject(); 456 compile(); 457 } 458 459 private static native void closeImpl(long addr); 460 private static native long compileImpl(String regex, int flags); 461} 462