Pattern.java revision 5f37da05bb48298568f8abd7c97c3d11552e1867
1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package java.util.regex;
18
19import java.io.IOException;
20import java.io.ObjectInputStream;
21import java.io.Serializable;
22import java.util.ArrayList;
23import com.ibm.icu4jni.regex.NativeRegEx;
24
25/**
26 * Patterns are compiled regular expressions. In many cases, convenience methods such as
27 * {@link String#matches String.matches}, {@link String#replaceAll String.replaceAll} and
28 * {@link String#split String.split} will be preferable, but if you need to do a lot of work
29 * with the same regular expression, it may be more efficient to compile it once and reuse it.
30 * The {@code Pattern} class and its companion, {@link Matcher}, are also a lot more powerful
31 * than the small amount of functionality exposed by {@code String}.
32 *
33 * <pre>
34 * // String convenience methods:
35 * boolean sawFailures = s.matches("Failures: \\d+");
36 * String farewell = s.replaceAll("Hello, (\\S+)", "Goodbye, $1");
37 * String[] fields = s.split(":");
38 *
39 * // Direct use of Pattern:
40 * Pattern p = Pattern.compile("Hello, (\\S+)");
41 * Matcher m = p.matcher(inputString);
42 * while (m.find()) { // Find each match in turn; String can't do this.
43 *     String name = m.group(1); // Access a submatch group; String can't do this.
44 * }
45 * </pre>
46 *
47 * <h3>Regular expression syntax</h3>
48 * <span class="datatable">
49 * <style type="text/css">
50 * .datatable td { padding-right: 20px; }
51 * </style>
52 *
53 * <p>Java supports a subset of Perl 5 regular expression syntax. An important gotcha is that Java
54 * has no regular expression literals, and uses plain old string literals instead. This means that
55 * you need an extra level of escaping. For example, the regular expression {@code \s+} has to
56 * be represented as the string {@code "\\s+"}.
57 *
58 * <h3>Escape sequences</h3>
59 * <p><table>
60 * <tr> <td> \ </td> <td>Quote the following metacharacter (so {@code \.} matches a literal {@code .}).</td> </tr>
61 * <tr> <td> \Q </td> <td>Quote all following metacharacters until {@code \E}.</td> </tr>
62 * <tr> <td> \E </td> <td>Stop quoting metacharacters (started by {@code \Q}).</td> </tr>
63 * <tr> <td> \\ </td> <td>A literal backslash.</td> </tr>
64 * <tr> <td> &#x005c;u<i>hhhh</i> </td> <td>The Unicode character U+hhhh (in hex).</td> </tr>
65 * <tr> <td> \c<i>x</i> </td> <td>The ASCII control character <i>x</i> (so {@code \cI} would be U+0009).</td> </tr>
66 *
67 * <tr> <td> \a </td> <td>The ASCII bell character (U+0007).</td> </tr>
68 * <tr> <td> \e </td> <td>The ASCII ESC character (U+001b).</td> </tr>
69 * <tr> <td> \f </td> <td>The ASCII form feed character (U+000c).</td> </tr>
70 * <tr> <td> \n </td> <td>The ASCII newline character (U+000a).</td> </tr>
71 * <tr> <td> \r </td> <td>The ASCII carriage return character (U+000d).</td> </tr>
72 * <tr> <td> \t </td> <td>The ASCII tab character (U+0009).</td> </tr>
73 * </table>
74 *
75 * <h3>Character classes</h3>
76 * <p>It's possible to construct arbitrary character classes using set operations:
77 * <table>
78 * <tr> <td> [abc] </td> <td>Any one of {@code a}, {@code b}, or {@code c}. (Enumeration.)</td> </tr>
79 * <tr> <td> [a-c] </td> <td>Any one of {@code a}, {@code b}, or {@code c}. (Range.)</td> </tr>
80 * <tr> <td> [^abc] </td> <td>Any character <i>except</i> {@code a}, {@code b}, or {@code c}. (Negation.)</td> </tr>
81 * <tr> <td> [[a-f][0-9]] </td> <td>Any character in either range. (Union.)</td> </tr>
82 * <tr> <td> [[a-z]&&[jkl]] </td> <td>Any character in both ranges. (Intersection.)</td> </tr>
83 * </table>
84 * <p>Most of the time, the built-in character classes are more useful:
85 * <table>
86 * <tr> <td> \d </td> <td>Any digit character.</td> </tr>
87 * <tr> <td> \D </td> <td>Any non-digit character.</td> </tr>
88 * <tr> <td> \s </td> <td>Any whitespace character.</td> </tr>
89 * <tr> <td> \S </td> <td>Any non-whitespace character.</td> </tr>
90 * <tr> <td> \w </td> <td>Any word character.</td> </tr>
91 * <tr> <td> \W </td> <td>Any non-word character.</td> </tr>
92 * <tr> <td> \p{<i>NAME</i>} </td> <td> Any character in the class with the given <i>NAME</i>. </td> </tr>
93 * <tr> <td> \P{<i>NAME</i>} </td> <td> Any character <i>not</i> in the named class. </td> </tr>
94 * </table>
95 * <p>There are a variety of named classes:
96 * <ul>
97 * <li><a href="../../lang/Character.html#unicode_categories">Unicode category names</a>,
98 * prefixed by {@code Is}. For example {@code \p{IsLu}} for all uppercase letters.
99 * <li>POSIX class names. These are 'Alnum', 'Alpha', 'ASCII', 'Blank', 'Cntrl', 'Digit',
100 * 'Graph', 'Lower', 'Print', 'Punct', 'Upper', 'XDigit'.
101 * <li>Unicode block names, as used by {@link java.lang.Character.UnicodeBlock#forName} prefixed
102 * by {@code In}. For example {@code \p{InHebrew}} for all characters in the Hebrew block.
103 * <li>Character method names. These are all non-deprecated methods from {@link java.lang.Character}
104 * whose name starts with {@code is}, but with the {@code is} replaced by {@code java}.
105 * For example, {@code \p{javaLowerCase}}.
106 * </ul>
107 *
108 * <h3>Quantifiers</h3>
109 * <p>Quantifiers match some number of instances of the preceding regular expression.
110 * <table>
111 * <tr> <td> * </td> <td>Zero or more.</td> </tr>
112 * <tr> <td> ? </td> <td>Zero or one.</td> </tr>
113 * <tr> <td> + </td> <td>One or more.</td> </tr>
114 * <tr> <td> {<i>n</i>} </td> <td>Exactly <i>n</i>.</td> </tr>
115 * <tr> <td> {<i>n,</i>} </td> <td>At least <i>n</i>.</td> </tr>
116 * <tr> <td> {<i>n</i>,<i>m</i>} </td> <td>At least <i>n</i> but not more than <i>m</i>.</td> </tr>
117 * </table>
118 * <p>Quantifiers are "greedy" by default, meaning that they will match the longest possible input
119 * sequence. There are also non-greedy quantifiers that match the shortest possible input sequence.
120 * They're the same as the greedy ones but with a trailing {@code ?}:
121 * <table>
122 * <tr> <td> *? </td> <td>Zero or more (non-greedy).</td> </tr>
123 * <tr> <td> ?? </td> <td>Zero or one (non-greedy).</td> </tr>
124 * <tr> <td> +? </td> <td>One or more (non-greedy).</td> </tr>
125 * <tr> <td> {<i>n</i>}? </td> <td>Exactly <i>n</i> (non-greedy).</td> </tr>
126 * <tr> <td> {<i>n,</i>}? </td> <td>At least <i>n</i> (non-greedy).</td> </tr>
127 * <tr> <td> {<i>n</i>,<i>m</i>}? </td> <td>At least <i>n</i> but not more than <i>m</i> (non-greedy).</td> </tr>
128 * </table>
129 * <p>Quantifiers allow backtracking by default. There are also possessive quantifiers to prevent
130 * backtracking. They're the same as the greedy ones but with a trailing {@code +}:
131 * <table>
132 * <tr> <td> *+ </td> <td>Zero or more (possessive).</td> </tr>
133 * <tr> <td> ?+ </td> <td>Zero or one (possessive).</td> </tr>
134 * <tr> <td> ++ </td> <td>One or more (possessive).</td> </tr>
135 * <tr> <td> {<i>n</i>}+ </td> <td>Exactly <i>n</i> (possessive).</td> </tr>
136 * <tr> <td> {<i>n,</i>}+ </td> <td>At least <i>n</i> (possessive).</td> </tr>
137 * <tr> <td> {<i>n</i>,<i>m</i>}+ </td> <td>At least <i>n</i> but not more than <i>m</i> (possessive).</td> </tr>
138 * </table>
139 *
140 * <h3>Zero-width assertions</h3>
141 * <p><table>
142 * <tr> <td> ^ </td> <td>At beginning of line.</td> </tr>
143 * <tr> <td> $ </td> <td>At end of line.</td> </tr>
144 * <tr> <td> \A </td> <td>At beginning of input.</td> </tr>
145 * <tr> <td> \b </td> <td>At word boundary.</td> </tr>
146 * <tr> <td> \B </td> <td>At non-word boundary.</td> </tr>
147 * <tr> <td> \G </td> <td>At end of previous match.</td> </tr>
148 * <tr> <td> \z </td> <td>At end of input.</td> </tr>
149 * <tr> <td> \Z </td> <td>At end of input, or before newline at end.</td> </tr>
150 * </table>
151 *
152 * <h3>Look-around assertions</h3>
153 * <p>Look-around assertions assert that the subpattern does (positive) or doesn't (negative) match
154 * after (look-ahead) or before (look-behind) the current position, without including the matched
155 * text in the containing match. The maximum length of possible matches for look-behind patterns
156 * must not be unbounded.
157 * <p><table>
158 * <tr> <td> (?=<i>a</i>) </td> <td>Zero-width positive look-ahead.</td> </tr>
159 * <tr> <td> (?!<i>a</i>) </td> <td>Zero-width negative look-ahead.</td> </tr>
160 * <tr> <td> (?&lt;=<i>a</i>) </td> <td>Zero-width positive look-behind.</td> </tr>
161 * <tr> <td> (?&lt;!<i>a</i>) </td> <td>Zero-width negative look-behind.</td> </tr>
162 * </table>
163 *
164 * <h3>Groups</h3>
165 *
166 * <p><table>
167 * <tr> <td> (<i>a</i>) </td> <td>A capturing group.</td> </tr>
168 * <tr> <td> (?:<i>a</i>) </td> <td>A non-capturing group.</td> </tr>
169 * <tr> <td> (?&gt;<i>a</i>) </td> <td>An independent non-capturing group. (The first match of the subgroup is the only match tried.)</td> </tr>
170 * <tr> <td> \<i>n</i> </td> <td>The text already matched by capturing group <i>n</i>.</td> </tr>
171 * </table>
172 * <p>Explicit capturing groups are numbered from 1, and available via {@link Matcher#group}.
173 * Group 0 represents the whole match.
174 *
175 * <h3>Operators</h3>
176 * <p><table>
177 * <tr> <td> <i>ab</i> </td> <td>Expression <i>a</i> followed by expression <i>b</i>.</td> </tr>
178 * <tr> <td> <i>a</i>|<i>b</i> </td> <td>Either expression <i>a</i> or expression <i>b</i>.</td> </tr>
179 * </table>
180 *
181 * <a name="flags"><h3>Flags</h3></a>
182 * <p><table>
183 * <tr> <td> (?dimsux-dimsux:<i>a</i>) </td> <td>Evaluates the expression <i>a</i> with the given flags enabled/disabled.</td> </tr>
184 * <tr> <td> (?dimsux-dimsux) </td> <td>Evaluates the rest of the pattern with the given flags enabled/disabled.</td> </tr>
185 * </table>
186 *
187 * <p>The flags are:
188 * <table>
189 * <tr><td>{@code i}</td> <td>{@link #CASE_INSENSITIVE}</td> <td>case insensitive matching</td></tr>
190 * <tr><td>{@code d}</td> <td>{@link #UNIX_LINES}</td>       <td>only accept {@code '\n'} as a line terminator</td></tr>
191 * <tr><td>{@code m}</td> <td>{@link #MULTILINE}</td>        <td>allow {@code ^} and {@code $} to match beginning/end of any line</td></tr>
192 * <tr><td>{@code s}</td> <td>{@link #DOTALL}</td>           <td>allow {@code .} to match {@code '\n'} ("s" for "single line")</td></tr>
193 * <tr><td>{@code u}</td> <td>{@link #UNICODE_CASE}</td>     <td>enable Unicode case folding</td></tr>
194 * <tr><td>{@code x}</td> <td>{@link #COMMENTS}</td>         <td>allow whitespace and comments</td></tr>
195 * </table>
196 * <p>Either set of flags may be empty. For example, {@code (?i-m)} would turn on case-insensitivity
197 * and turn off multiline mode, {@code (?i)} would just turn on case-insensitivity,
198 * and {@code (?-m)} would just turn off multiline mode.
199 * <p>Note that on Android, {@code UNICODE_CASE} is always on: case-insensitive matching will
200 * always be Unicode-aware.
201 * <p>There are two other flags not settable via this mechanism: {@link #CANON_EQ} and
202 * {@link #LITERAL}. Attempts to use {@link #CANON_EQ} on Android will throw an exception.
203 * </span>
204 *
205 * <h3>Implementation notes</h3>
206 *
207 * The regular expression implementation used in Android is provided by
208 * <a href="http://www.icu-project.org">ICU</a>. The notation for the regular
209 * expressions is mostly a superset of that used in other Java language
210 * implementations. This means that existing applications will normally work as
211 * expected, but in rare cases Android may accept a regular expression that is
212 * not accepted by other implementations.
213 *
214 * <p>In some cases, Android will recognize that a regular expression is a simple
215 * special case that can be handled more efficiently. This is true of both the convenience methods
216 * in {@code String} and the methods in {@code Pattern}.
217 *
218 * @see Matcher
219 */
220public final class Pattern implements Serializable {
221
222    private static final long serialVersionUID = 5073258162644648461L;
223
224    /**
225     * This constant specifies that only Unix line endings ('\n') are recognized
226     * by the '.', '^', and '$' meta characters. Corresponds to {@code (?d)}.
227     */
228    public static final int UNIX_LINES = 0x01;
229
230    /**
231     * This constant specifies that a {@code Pattern} is matched
232     * case-insensitively. That is, the patterns "a+" and "A+" would both match
233     * the string "aAaAaA". See {@link #UNICODE_CASE}. Corresponds to {@code (?i)}.
234     */
235    public static final int CASE_INSENSITIVE = 0x02;
236
237    /**
238     * This constant specifies that a {@code Pattern} may contain whitespace or
239     * comments. Otherwise comments and whitespace are taken as literal
240     * characters. Corresponds to {@code (?x)}.
241     */
242    public static final int COMMENTS = 0x04;
243
244    /**
245     * This constant specifies that the meta characters '^' and '$' match at
246     * the beginning and end of each input line, respectively. Normally, they
247     * match only the beginning and the end of the complete input. Corresponds to {@code (?m)}.
248     */
249    public static final int MULTILINE = 0x08;
250
251    /**
252     * This constant specifies that the whole {@code Pattern} is to be taken literally,
253     * that is, all meta characters lose their meanings (the pattern is passed through {@link #quote}).
254     */
255    public static final int LITERAL = 0x10;
256
257    /**
258     * This constant specifies that the '.' meta character matches arbitrary
259     * characters, including line endings, which is normally not the case.
260     * Corresponds to {@code (?s)}.
261     */
262    public static final int DOTALL = 0x20;
263
264    /**
265     * This constant specifies that a {@code Pattern} that uses case-insensitive matching
266     * will use Unicode case folding. On Android, {@code UNICODE_CASE} is always on:
267     * case-insensitive matching will always be Unicode-aware. If your code is intended to
268     * be portable and uses case-insensitive matching on non-ASCII characters, you should
269     * use this flag. Corresponds to {@code (?u)}.
270     */
271    public static final int UNICODE_CASE = 0x40;
272
273    /**
274     * This constant specifies that a character in a {@code Pattern} and a
275     * character in the input string only match if they are canonically equivalent.
276     * Not supported on Android: the constructor throws {@code UnsupportedOperationException}.
277     */
278    public static final int CANON_EQ = 0x80;
279
280    private final String pattern; // The regular expression exactly as passed to the constructor.
281    private final int flags; // The flags exactly as passed to the constructor.
282
283    /**
284     * Holds the handle (a pointer, actually) returned by {@code NativeRegEx.open}; closed in {@link #finalize}.
285     */
286    transient int mNativePattern; // Transient: readObject recompiles the pattern to obtain a fresh handle.
287
288    /**
289     * Holds the number of groups in the pattern, as reported by {@code NativeRegEx.groupCount}.
290     */
291    transient int mGroupCount;
292
293    /**
294     * Returns a {@link Matcher} for this pattern applied to the given {@code input}.
295     * The {@code Matcher} can be used to match the {@code Pattern} against the
296     * whole input, find occurrences of the {@code Pattern} in the input, or
297     * replace parts of the input.
298     */
299    public Matcher matcher(CharSequence input) {
300        return new Matcher(this, input);
301    }
302
303    /**
304     * Splits the given {@code input} at occurrences of this pattern.
305     *
306     * <p>If this pattern does not occur in the input, the result is an
307     * array containing the input (converted from a {@code CharSequence} to
308     * a {@code String}).
309     *
310     * <p>Otherwise, the {@code limit} parameter controls the contents of the
311     * returned array as described below.
312     *
313     * @param limit
314     *            Determines the maximum number of entries in the resulting
315     *            array, and the treatment of trailing empty strings.
316     *            <ul>
317     *            <li>For n &gt; 0, the resulting array contains at most n
318     *            entries. If this is fewer than the number of matches, the
319     *            final entry will contain all remaining input.
320     *            <li>For n &lt; 0, the length of the resulting array is
321     *            exactly the number of occurrences of the {@code Pattern}
322     *            plus one for the text after the final separator.
323     *            All entries are included.
324     *            <li>For n == 0, the result is as for n &lt; 0, except
325     *            trailing empty strings will not be returned. (Note that
326     *            the case where the input is itself an empty string is
327     *            special, as described above, and the limit parameter does
328     *            not apply there.)
329     *            </ul>
330     */
331    public String[] split(CharSequence input, int limit) {
332        return Splitter.split(this, pattern, input.toString(), limit);
333    }
334
335    /**
336     * Equivalent to {@code split(input, 0)}.
337     */
338    public String[] split(CharSequence input) {
339        return split(input, 0);
340    }
341
342    /**
343     * Returns the regular expression supplied to {@code compile}.
344     */
345    public String pattern() {
346        return pattern;
347    }
348
349    @Override
350    public String toString() {
351        return pattern;
352    }
353
354    /**
355     * Returns the flags supplied to {@code compile}.
356     */
357    public int flags() {
358        return flags;
359    }
360
361    /**
362     * Returns a compiled form of the given {@code regularExpression}, as modified by the
363     * given {@code flags}. See the <a href="#flags">flags overview</a> for more on flags.
364     *
365     * @throws PatternSyntaxException if the regular expression is syntactically incorrect.
366     *
367     * @see #CANON_EQ
368     * @see #CASE_INSENSITIVE
369     * @see #COMMENTS
370     * @see #DOTALL
371     * @see #LITERAL
372     * @see #MULTILINE
373     * @see #UNICODE_CASE
374     * @see #UNIX_LINES
375     */
376    public static Pattern compile(String regularExpression, int flags) throws PatternSyntaxException {
377        return new Pattern(regularExpression, flags);
378    }
379
380    /**
381     * Equivalent to {@code Pattern.compile(pattern, 0)}.
382     */
383    public static Pattern compile(String pattern) {
384        return new Pattern(pattern, 0);
385    }
386
387    private Pattern(String pattern, int flags) throws PatternSyntaxException {
388        if ((flags & CANON_EQ) != 0) {
389            throw new UnsupportedOperationException("CANON_EQ flag not supported");
390        }
391        this.pattern = pattern;
392        this.flags = flags;
393        compileImpl(pattern, flags);
394    }
395
396    private void compileImpl(String pattern, int flags) throws PatternSyntaxException {
397        if (pattern == null) {
398            throw new NullPointerException();
399        }
400
401        if ((flags & LITERAL) != 0) {
402            pattern = quote(pattern);
403        }
404
405        // These are the flags natively supported by ICU.
406        // They even have the same value in native code.
407        flags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES);
408
409        mNativePattern = NativeRegEx.open(pattern, flags);
410        mGroupCount = NativeRegEx.groupCount(mNativePattern);
411    }
412
413    /**
414     * Tests whether the given {@code regularExpression} matches the given {@code input}.
415     * Equivalent to {@code Pattern.compile(regularExpression).matcher(input).matches()}.
416     * If the same regular expression is to be used for multiple operations, it may be more
417     * efficient to reuse a compiled {@code Pattern}.
418     *
419     * @see Pattern#compile(java.lang.String, int)
420     * @see Matcher#matches()
421     */
422    public static boolean matches(String regularExpression, CharSequence input) {
423        return new Matcher(new Pattern(regularExpression, 0), input).matches();
424    }
425
426    /**
427     * Quotes the given {@code string} using "\Q" and "\E", so that all
428     * meta-characters lose their special meaning. This method correctly
429     * escapes embedded instances of "\Q" or "\E". If the entire result
430     * is to be passed verbatim to {@link #compile}, it's usually clearer
431     * to use the {@link #LITERAL} flag instead.
432     */
433    public static String quote(String string) {
434        StringBuilder sb = new StringBuilder();
435        sb.append("\\Q");
436        int apos = 0;
437        int k;
438        while ((k = string.indexOf("\\E", apos)) >= 0) {
439            sb.append(string.substring(apos, k + 2)).append("\\\\E\\Q");
440            apos = k + 2;
441        }
442        return sb.append(string.substring(apos)).append("\\E").toString();
443    }
444
445    @Override
446    protected void finalize() throws Throwable {
447        try {
448            if (mNativePattern != 0) {
449                NativeRegEx.close(mNativePattern);
450            }
451        } finally {
452            super.finalize();
453        }
454    }
455
456    private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
457        s.defaultReadObject();
458        compileImpl(pattern, flags);
459    }
460}
461