Patterns.java revision db990751ef8e535ea5cb3d527e36936e119095e8
1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.util;
18
19import java.util.regex.Matcher;
20import java.util.regex.Pattern;
21
/**
 * Commonly used regular expression patterns.
 *
 * <p>This is a static utility class: it exposes only constants and static
 * helper methods, and its sole constructor is private.
 */
public class Patterns {
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2010/05/06.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     *
     *  <p>The expression is wrapped in one capturing group, and several of the
     *  per-letter alternatives are capturing groups as well.
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
        "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(biz|b[abdefghijmnorstvwyz])"
        + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(info|int|i[delmnoqrst])"
        + "|(jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(name|net|n[acefgilopruz])"
        + "|(org|om)"
        + "|(pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-mgbaam7a8h|xn\\-\\-mgberp4a5d4ar|xn\\-\\-wgbh1c|xn\\-\\-zckzah)"
        + "|y[et]"
        + "|z[amw])";

    /**
     *  Regular expression pattern to match all IANA top-level domains.
     *  Compiled from {@link #TOP_LEVEL_DOMAIN_STR}.
     */
    public static final Pattern TOP_LEVEL_DOMAIN =
        Pattern.compile(TOP_LEVEL_DOMAIN_STR);

    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2010/05/06.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     *
     *  <p>Same alternatives as {@link #TOP_LEVEL_DOMAIN_STR}, but every group
     *  is non-capturing.  Note that this fragment ends with one more closing
     *  parenthesis than it opens: the extra one closes a group that
     *  {@link #WEB_URL} opens before splicing this string in, so the fragment
     *  is not a valid regular expression on its own.
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
        + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(?:biz|b[abdefghijmnorstvwyz])"
        + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(?:edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(?:gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(?:info|int|i[delmnoqrst])"
        + "|(?:jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(?:name|net|n[acefgilopruz])"
        + "|(?:org|om)"
        + "|(?:pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-mgbaam7a8h|xn\\-\\-mgberp4a5d4ar|xn\\-\\-wgbh1c|xn\\-\\-zckzah)"
        + "|y[et]"
        + "|z[amw]))";

    /**
     * Good characters for Internationalized Resource Identifiers (IRI).
     * This comprises the most commonly used Unicode characters allowed in IRI
     * as detailed in RFC 3987.
     *
     * <p>The value is the <em>interior</em> of a character class (no
     * surrounding brackets) so that callers can embed it and add further
     * characters.  It covers ASCII letters and digits plus the ranges
     * U+00A0..U+D7FF, U+F900..U+FDCF and U+FDF0..U+FFEF; the surrogate range
     * is skipped, so characters that Java stores as a surrogate pair are not
     * included.
     */
    public static final String GOOD_IRI_CHAR =
        "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    /**
     *  Regular expression pattern to match most part of RFC 3987
     *  Internationalized URLs, aka IRIs.  Commonly used Unicode characters are
     *  added.
     *
     *  <p>In order, the expression accepts: an optional scheme
     *  (http, https or rtsp, optionally capitalised) with optional
     *  user[:password]@ information; a named host ending in a known top-level
     *  domain, or a dotted-quad IP address; an optional port of 1 to 5 digits;
     *  an optional path/query part; and finally a word boundary or end of
     *  input.
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+"   // named host
        + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
        + "|[1-9][0-9]|[0-9])))"
        + "(?:\\:\\d{1,5})?)" // plus optional port number
        + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus optional query params
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input.  This is to stop foo.sure from
                        // matching as foo.su

    /**
     * Regular expression pattern to match a dotted-quad IP address: four
     * decimal fields separated by dots, each limited to the range 0-255.
     * The first field does not accept the single digit {@code 0}.
     *
     * <p>The expression is not anchored, so when used with
     * {@link Matcher#find()} it can match inside a longer string.
     */
    public static final Pattern IP_ADDRESS
        = Pattern.compile(
            "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
            + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
            + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
            + "|[1-9][0-9]|[0-9]))");

    /**
     * Regular expression pattern to match either a host name made of one or
     * more dot-terminated labels followed by a known top-level domain
     * ({@link #TOP_LEVEL_DOMAIN_STR}), or an IP address ({@link #IP_ADDRESS}).
     * Labels are built from {@link #GOOD_IRI_CHAR} characters, may contain
     * dashes, and must end with a non-dash character.
     *
     * <p>NOTE(review): the label sub-expression nests one unbounded repetition
     * inside another, which looks prone to heavy backtracking on long inputs
     * that contain no dot -- confirm before applying it to untrusted text.
     */
    public static final Pattern DOMAIN_NAME
        = Pattern.compile(
            "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
            // Splice in the regex source text explicitly instead of relying on
            // the implicit Pattern.toString() conversion; the resulting
            // expression is character-for-character the same.
            + TOP_LEVEL_DOMAIN_STR + ")|"
            + IP_ADDRESS.pattern() + ")");

    /**
     * Regular expression pattern to find things that look like e-mail
     * addresses.  It accepts a local part of 1 to 256 characters (ASCII
     * letters, digits, and the characters plus, dot, underscore, percent and
     * dash), an at sign, a first domain label of 1 to 65 characters, and then
     * one or more dot-prefixed labels of 1 to 26 characters each.  Domain
     * labels consist of ASCII letters, digits and dashes and may not start
     * with a dash.
     *
     * <p>The plus sign is listed twice in the local-part character class; the
     * duplicate has no effect on matching.
     */
    public static final Pattern EMAIL_ADDRESS
        = Pattern.compile(
            "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" +
            "\\@" +
            "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
            "(" +
                "\\." +
                "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" +
            ")+"
        );

    /**
     * This pattern is intended for searching for things that look like they
     * might be phone numbers in arbitrary text, not for validating whether
     * something is in fact a phone number.  It will miss many things that
     * are legitimate phone numbers.
     *
     * <p> The pattern matches the following:
     * <ul>
     * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
     * may follow.
     * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
     * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
     * </ul>
     */
    public static final Pattern PHONE
        = Pattern.compile(                                  // sdd = space, dot, or dash
                "(\\+[0-9]+[\\- \\.]*)?"                    // +<digits><sdd>*
                + "(\\([0-9]+\\)[\\- \\.]*)?"               // (<digits>)<sdd>*
                + "([0-9][0-9\\- \\.][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit>

    /**
     *  Convenience method to take all of the non-null matching groups in a
     *  regex Matcher and return them as a concatenated string.
     *
     *  <p>Only the numbered capturing groups (1 through
     *  {@link Matcher#groupCount()}) are used; group 0, the whole match, is
     *  not.  Groups that did not participate in the match are skipped.  The
     *  method has no side effects beyond reading from the matcher.
     *
     *  @param matcher      The Matcher object from which grouped text will
     *                      be extracted; a match must already have been
     *                      performed on it
     *
     *  @return             A String comprising all of the non-null matched
     *                      groups concatenated together, in group order
     *
     *  @throws IllegalStateException if the matcher has capturing groups and
     *                      no match has been attempted yet, or the previous
     *                      match operation failed
     */
    public static final String concatGroups(Matcher matcher) {
        StringBuilder b = new StringBuilder();
        final int numGroups = matcher.groupCount();

        for (int i = 1; i <= numGroups; i++) {
            String s = matcher.group(i);

            // group(i) is null when the group took no part in the match.
            if (s != null) {
                b.append(s);
            }
        }

        return b.toString();
    }

    /**
     * Convenience method to return only the digits and plus signs
     * in the matching string.
     *
     * <p>The whole match ({@link Matcher#group()}) is scanned, not the
     * individual capturing groups.  Digits are recognised with
     * {@link Character#isDigit(char)}, so the result is not restricted to the
     * ASCII digits 0-9 if the matched text contains other digit characters.
     *
     * @param matcher      The Matcher object from which digits and plus will
     *                     be extracted; a match must already have been
     *                     performed on it
     *
     * @return             A String comprising all of the digits and plus in
     *                     the match, in their original order
     *
     * @throws IllegalStateException if no match has been attempted yet, or
     *                     the previous match operation failed
     */
    public static final String digitsAndPlusOnly(Matcher matcher) {
        StringBuilder buffer = new StringBuilder();
        String matchingRegion = matcher.group();

        for (int i = 0, size = matchingRegion.length(); i < size; i++) {
            char character = matchingRegion.charAt(i);

            if (character == '+' || Character.isDigit(character)) {
                buffer.append(character);
            }
        }
        return buffer.toString();
    }

    /**
     * Do not create this static utility class.
     */
    private Patterns() {}
}
233