1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package java.util.regex;
18
19import java.io.Serializable;
20import java.util.ArrayList;
21import com.ibm.icu4jni.regex.NativeRegEx;
22
23/**
24 * Represents a pattern used for matching, searching, or replacing strings.
25 * {@code Pattern}s are specified in terms of regular expressions and compiled
26 * using an instance of this class. They are then used in conjunction with a
27 * {@link Matcher} to perform the actual search.
28 * <p/>
29 * A typical use case looks like this:
30 * <p/>
31 * <pre>
32 * Pattern p = Pattern.compile("Hello, A[a-z]*!");
33 *
34 * Matcher m = p.matcher("Hello, Android!");
35 * boolean b1 = m.matches(); // true
36 *
 * m.reset("Hello, Robot!");
38 * boolean b2 = m.matches(); // false
39 * </pre>
40 * <p/>
 * The above code could also be written in a more compact fashion, though this
 * variant is less efficient, since {@code Pattern} and {@code Matcher} objects
 * are created on the fly instead of being reused:
45 * <pre>
46 *     boolean b1 = Pattern.matches("Hello, A[a-z]*!", "Hello, Android!"); // true
47 *     boolean b2 = Pattern.matches("Hello, A[a-z]*!", "Hello, Robot!");   // false
48 * </pre>
49 * <p/>
50 * Please consult the <a href="package-descr.html">package documentation</a> for an
51 * overview of the regular expression syntax used in this class as well as
52 * Android-specific implementation details.
53 *
54 * @see Matcher
55 * @since Android 1.0
56 */
57public final class Pattern implements Serializable {
58
59    private static final long serialVersionUID = 5073258162644648461L;
60
61    /**
62     * This constant specifies that a pattern matches Unix line endings ('\n')
63     * only against the '.', '^', and '$' meta characters.
64     */
65    public static final int UNIX_LINES = 0x01;
66
67    /**
68     * This constant specifies that a {@code Pattern} is matched
69     * case-insensitively. That is, the patterns "a+" and "A+" would both match
70     * the string "aAaAaA".
71     * <p>
72     * Note: For Android, the {@code CASE_INSENSITIVE} constant
73     * (currently) always includes the meaning of the {@link #UNICODE_CASE}
74     * constant. So if case insensitivity is enabled, this automatically extends
75     * to all Unicode characters. The {@code UNICODE_CASE} constant itself has
76     * no special consequences.
77     */
78    public static final int CASE_INSENSITIVE = 0x02;
79
80    /**
81     * This constant specifies that a {@code Pattern} may contain whitespace or
82     * comments. Otherwise comments and whitespace are taken as literal
83     * characters.
84     */
85    public static final int COMMENTS = 0x04;
86
87    /**
88     * This constant specifies that the meta characters '^' and '$' match only
89     * the beginning and end end of an input line, respectively. Normally, they
90     * match the beginning and the end of the complete input.
91     */
92    public static final int MULTILINE = 0x08;
93
94    /**
95     * This constant specifies that the whole {@code Pattern} is to be taken
96     * literally, that is, all meta characters lose their meanings.
97     */
98    public static final int LITERAL = 0x10;
99
100    /**
101     * This constant specifies that the '.' meta character matches arbitrary
102     * characters, including line endings, which is normally not the case.
103     */
104    public static final int DOTALL = 0x20;
105
106    /**
107     * This constant specifies that a {@code Pattern} is matched
108     * case-insensitively with regard to all Unicode characters. It is used in
109     * conjunction with the {@link #CASE_INSENSITIVE} constant to extend its
110     * meaning to all Unicode characters.
111     * <p>
112     * Note: For Android, the {@code CASE_INSENSITIVE} constant
113     * (currently) always includes the meaning of the {@code UNICODE_CASE}
114     * constant. So if case insensitivity is enabled, this automatically extends
115     * to all Unicode characters. The {@code UNICODE_CASE} constant then has no
116     * special consequences.
117     */
118    public static final int UNICODE_CASE = 0x40;
119
120    /**
121     * This constant specifies that a character in a {@code Pattern} and a
122     * character in the input string only match if they are canonically
123     * equivalent. It is (currently) not supported in Android.
124     */
125    public static final int CANON_EQ = 0x80;
126
127    /**
128     * Holds the regular expression.
129     */
130    private String pattern;
131
132    /**
133     * Holds the flags used when compiling this pattern.
134     */
135    private int flags;
136
137    /**
138     * Holds a handle (a pointer, actually) for the native ICU pattern.
139     */
140    transient int mNativePattern;
141
142    /**
143     * Holds the number of groups in the pattern.
144     */
145    transient int mGroupCount;
146
147
148    /**
149     * Returns a {@link Matcher} for the {@code Pattern} and a given input. The
150     * {@code Matcher} can be used to match the {@code Pattern} against the
151     * whole input, find occurrences of the {@code Pattern} in the input, or
152     * replace parts of the input.
153     *
154     * @param input
155     *            the input to process.
156     *
157     * @return the resulting {@code Matcher}.
158     */
159    public Matcher matcher(CharSequence input) {
160        return new Matcher(this, input);
161    }
162
163    /**
164     * Splits the given input sequence at occurrences of this {@code Pattern}.
165     *
166     * <p>If this {@code Pattern} does not occur in the input, the result is an
167     * array containing the input (converted from a {@code CharSequence} to
168     * a {@code String}).
169     *
170     * <p>Otherwise, the {@code limit} parameter controls the contents of the
171     * returned array as described below.
172     *
173     * @param inputSeq
174     *            the input sequence.
175     * @param limit
176     *            Determines the maximum number of entries in the resulting
177     *            array, and the treatment of trailing empty strings.
178     *            <ul>
179     *            <li>For n &gt; 0, the resulting array contains at most n
180     *            entries. If this is fewer than the number of matches, the
181     *            final entry will contain all remaining input.
182     *            <li>For n &lt; 0, the length of the resulting array is
183     *            exactly the number of occurrences of the {@code Pattern}
184     *            plus one for the text after the final separator.
185     *            All entries are included.
186     *            <li>For n == 0, the result is as for n &lt; 0, except
187     *            trailing empty strings will not be returned. (Note that
188     *            the case where the input is itself an empty string is
189     *            special, as described above, and the limit parameter does
190     *            not apply there.)
191     *            </ul>
192     *
193     * @return the resulting array.
194     */
195    public String[] split(CharSequence inputSeq, int limit) {
196        if (inputSeq.length() == 0) {
197            // Unlike Perl, which considers the result of splitting the empty
198            // string to be the empty array, Java returns an array containing
199            // the empty string.
200            return new String[] { "" };
201        }
202
203        int maxLength = limit <= 0 ? Integer.MAX_VALUE : limit;
204
205        String input = inputSeq.toString();
206        ArrayList<String> list = new ArrayList<String>();
207
208        Matcher matcher = new Matcher(this, inputSeq);
209        int savedPos = 0;
210
211        // Add text preceding each occurrence, if enough space.
212        while(matcher.find() && list.size() + 1 < maxLength) {
213            list.add(input.substring(savedPos, matcher.start()));
214            savedPos = matcher.end();
215        }
216
217        // Add trailing text if enough space.
218        if (list.size() < maxLength) {
219            if (savedPos < input.length()) {
220                list.add(input.substring(savedPos));
221            } else {
222                list.add("");
223            }
224        }
225
226        // Remove trailing empty matches in the limit == 0 case.
227        if (limit == 0) {
228            int i = list.size() - 1;
229            while (i >= 0 && "".equals(list.get(i))) {
230                list.remove(i);
231                i--;
232            }
233        }
234
235        return list.toArray(new String[list.size()]);
236    }
237
238    /**
239     * Splits a given input around occurrences of a regular expression. This is
240     * a convenience method that is equivalent to calling the method
241     * {@link #split(java.lang.CharSequence, int)} with a limit of 0.
242     *
243     * @param input
244     *            the input sequence.
245     *
246     * @return the resulting array.
247     */
248    public String[] split(CharSequence input) {
249        return split(input, 0);
250    }
251
252    /**
253     * Returns the regular expression that was compiled into this
254     * {@code Pattern}.
255     *
256     * @return the regular expression.
257     */
258    public String pattern() {
259        return pattern;
260    }
261
262    @Override
263    public String toString() {
264        return pattern;
265    }
266
267    /**
268     * Returns the flags that have been set for this {@code Pattern}.
269     *
270     * @return the flags that have been set. A combination of the constants
271     *         defined in this class.
272     *
273     * @see #CANON_EQ
274     * @see #CASE_INSENSITIVE
275     * @see #COMMENTS
276     * @see #DOTALL
277     * @see #LITERAL
278     * @see #MULTILINE
279     * @see #UNICODE_CASE
280     * @see #UNIX_LINES
281     */
282    public int flags() {
283        return flags;
284    }
285
286    /**
287     * Compiles a regular expression, creating a new {@code Pattern} instance in
288     * the process. Allows to set some flags that modify the behavior of the
289     * {@code Pattern}.
290     *
291     * @param pattern
292     *            the regular expression.
293     * @param flags
294     *            the flags to set. Basically, any combination of the constants
295     *            defined in this class is valid.
296     *            <p>
297     *            Note: Currently, the {@link #CASE_INSENSITIVE} and
298     *            {@link #UNICODE_CASE} constants have slightly special behavior
299     *            in Android, and the {@link #CANON_EQ} constant is not
300     *            supported at all.
301     *
302     * @return the new {@code Pattern} instance.
303     *
304     * @throws PatternSyntaxException
305     *             if the regular expression is syntactically incorrect.
306     *
307     * @see #CANON_EQ
308     * @see #CASE_INSENSITIVE
309     * @see #COMMENTS
310     * @see #DOTALL
311     * @see #LITERAL
312     * @see #MULTILINE
313     * @see #UNICODE_CASE
314     * @see #UNIX_LINES
315     */
316    public static Pattern compile(String pattern, int flags) throws PatternSyntaxException {
317        return new Pattern(pattern, flags);
318    }
319
320    /**
321     * Creates a new {@code Pattern} instance from a given regular expression
322     * and flags.
323     *
324     * @param pattern
325     *            the regular expression.
326     * @param flags
327     *            the flags to set. Any combination of the constants defined in
328     *            this class is valid.
329     *
330     * @throws PatternSyntaxException
331     *             if the regular expression is syntactically incorrect.
332     */
333    private Pattern(String pattern, int flags) throws PatternSyntaxException {
334        if ((flags & CANON_EQ) != 0) {
335            throw new UnsupportedOperationException("CANON_EQ flag not supported");
336        }
337
338        this.pattern = pattern;
339        this.flags = flags;
340
341        compileImpl(pattern, flags);
342    }
343
344    /**
345     * Compiles a regular expression, creating a new Pattern instance in the
346     * process. This is actually a convenience method that calls {@link
347     * #compile(String, int)} with a {@code flags} value of zero.
348     *
349     * @param pattern
350     *            the regular expression.
351     *
352     * @return the new {@code Pattern} instance.
353     *
354     * @throws PatternSyntaxException
355     *             if the regular expression is syntactically incorrect.
356     */
357    public static Pattern compile(String pattern) {
358        return new Pattern(pattern, 0);
359    }
360
361    /**
362     * Compiles the given regular expression using the given flags. Used
363     * internally only.
364     *
365     * @param pattern
366     *            the regular expression.
367     * @param flags
368     *            the flags.
369     */
370    private void compileImpl(String pattern, int flags) throws PatternSyntaxException {
371        if (pattern == null) {
372            throw new NullPointerException();
373        }
374
375        if ((flags & LITERAL) != 0) {
376            pattern = quote(pattern);
377        }
378
379        // These are the flags natively supported by ICU.
380        // They even have the same value in native code.
381        flags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES);
382
383        mNativePattern = NativeRegEx.open(pattern, flags);
384        mGroupCount = NativeRegEx.groupCount(mNativePattern);
385    }
386
387    /**
388     * Tries to match a given regular expression against a given input. This is
389     * actually nothing but a convenience method that compiles the regular
390     * expression into a {@code Pattern}, builds a {@link Matcher} for it, and
391     * then does the match. If the same regular expression is used for multiple
392     * operations, it is recommended to compile it into a {@code Pattern}
393     * explicitly and request a reusable {@code Matcher}.
394     *
395     * @param regex
396     *            the regular expression.
397     * @param input
398     *            the input to process.
399     *
400     * @return true if and only if the {@code Pattern} matches the input.
401     *
402     * @see Pattern#compile(java.lang.String, int)
403     * @see Matcher#matches()
404     */
405    public static boolean matches(String regex, CharSequence input) {
406        return new Matcher(new Pattern(regex, 0), input).matches();
407    }
408
409    /**
410     * Quotes a given string using "\Q" and "\E", so that all other
411     * meta-characters lose their special meaning. If the string is used for a
412     * {@code Pattern} afterwards, it can only be matched literally.
413     *
414     * @param s
415     *            the string to quote.
416     *
417     * @return the quoted string.
418     */
419    public static String quote(String s) {
420        StringBuilder sb = new StringBuilder().append("\\Q"); //$NON-NLS-1$
421        int apos = 0;
422        int k;
423        while ((k = s.indexOf("\\E", apos)) >= 0) { //$NON-NLS-1$
424            sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q"); //$NON-NLS-1$
425            apos = k + 2;
426        }
427
428        return sb.append(s.substring(apos)).append("\\E").toString(); //$NON-NLS-1$
429    }
430
431    @Override
432    protected void finalize() throws Throwable {
433        try {
434            if (mNativePattern != 0) {
435                NativeRegEx.close(mNativePattern);
436            }
437        }
438        finally {
439            super.finalize();
440        }
441    }
442
443    /**
444     * Serialization support
445     */
446    private void readObject(java.io.ObjectInputStream s)
447            throws java.io.IOException, ClassNotFoundException {
448        s.defaultReadObject();
449
450        compileImpl(pattern, flags);
451    }
452
453}
454