1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text.util;
18
19import java.util.regex.Matcher;
20import java.util.regex.Pattern;
21
/**
 * Commonly used regular expression patterns (top-level domains, web URLs,
 * IP addresses, domain names, e-mail addresses and phone numbers) together
 * with small helpers for post-processing a {@link Matcher}.
 *
 * @hide
 */
public class Regex {
    /**
     *  Regular expression pattern to match all IANA top-level domains.
     *  List accurate as of 2007/06/15.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by //device/tools/make-iana-tld-pattern.py
     */
    public static final Pattern TOP_LEVEL_DOMAIN_PATTERN
        = Pattern.compile(
                "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
                + "|(biz|b[abdefghijmnorstvwyz])"
                + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
                + "|d[ejkmoz]"
                + "|(edu|e[cegrstu])"
                + "|f[ijkmor]"
                + "|(gov|g[abdefghilmnpqrstuwy])"
                + "|h[kmnrtu]"
                + "|(info|int|i[delmnoqrst])"
                + "|(jobs|j[emop])"
                + "|k[eghimnrwyz]"
                + "|l[abcikrstuvy]"
                + "|(mil|mobi|museum|m[acdghklmnopqrstuvwxyz])"
                + "|(name|net|n[acefgilopruz])"
                + "|(org|om)"
                + "|(pro|p[aefghklmnrstwy])"
                + "|qa"
                + "|r[eouw]"
                + "|s[abcdeghijklmnortuvyz]"
                + "|(tel|travel|t[cdfghjklmnoprtvwz])"
                + "|u[agkmsyz]"
                + "|v[aceginu]"
                + "|w[fs]"
                + "|y[etu]"
                + "|z[amw])");

    /**
     *  Regular expression pattern to match RFC 1738 URLs.
     *
     *  <p>The scheme ({@code http}, {@code https} or {@code rtsp}, with a
     *  lower-case or capitalized first letter) and the user-info part are
     *  optional.  The host is either a named host ending in one of the IANA
     *  top-level domains or a dotted-quad IP address, optionally followed by
     *  a port number and a path/query part.
     *
     *  <p>The embedded top-level domain list is accurate as of 2007/06/15 and
     *  was taken from http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  (auto-generated by //device/tools/make-iana-tld-pattern.py).  It uses
     *  non-capturing groups, unlike {@link #TOP_LEVEL_DOMAIN_PATTERN}.
     *
     *  <p>Capturing groups:
     *  <ol>
     *  <li>everything up to (but not including) the path: optional scheme and
     *      user-info, host, and optional port
     *  <li>the scheme name, if present
     *  <li>the host (named host or IP address)
     *  <li>the path/query part starting with {@code /}, if present
     *  </ol>
     */
    public static final Pattern WEB_URL_PATTERN
        = Pattern.compile(
            "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
            + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
            + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
            + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
            + "(?:"   // plus top level domain
            + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
            + "|(?:biz|b[abdefghijmnorstvwyz])"
            + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
            + "|d[ejkmoz]"
            + "|(?:edu|e[cegrstu])"
            + "|f[ijkmor]"
            + "|(?:gov|g[abdefghilmnpqrstuwy])"
            + "|h[kmnrtu]"
            + "|(?:info|int|i[delmnoqrst])"
            + "|(?:jobs|j[emop])"
            + "|k[eghimnrwyz]"
            + "|l[abcikrstuvy]"
            + "|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])"
            + "|(?:name|net|n[acefgilopruz])"
            + "|(?:org|om)"
            + "|(?:pro|p[aefghklmnrstwy])"
            + "|qa"
            + "|r[eouw]"
            + "|s[abcdeghijklmnortuvyz]"
            + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
            + "|u[agkmsyz]"
            + "|v[aceginu]"
            + "|w[fs]"
            + "|y[etu]"
            + "|z[amw]))"
            + "|(?:(?:25[0-5]|2[0-4]" // or ip address
            + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
            + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
            + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
            + "|[1-9][0-9]|[0-9])))"
            + "(?:\\:\\d{1,5})?)" // plus option port number
            + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
            + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
            + "(?:\\b|$)"); // and finally, a word boundary or end of
                            // input.  This is to stop foo.sure from
                            // matching as foo.su

    /**
     *  Regular expression pattern to match a dotted-quad IPv4 address.
     *  Every octet is limited to the range 0-255; three-digit forms with
     *  leading zeros (e.g. {@code 007}) are accepted, and the first octet
     *  cannot be a bare {@code 0}.
     */
    public static final Pattern IP_ADDRESS_PATTERN
        = Pattern.compile(
            "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
            + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
            + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
            + "|[1-9][0-9]|[0-9]))");

    /**
     *  Regular expression pattern to match a domain name: either a sequence
     *  of dot-terminated labels followed by a top-level domain accepted by
     *  {@link #TOP_LEVEL_DOMAIN_PATTERN}, or an address accepted by
     *  {@link #IP_ADDRESS_PATTERN}.  The regex source of those two patterns
     *  is embedded verbatim, so their capturing groups are part of this
     *  pattern as well.
     */
    public static final Pattern DOMAIN_NAME_PATTERN
        = Pattern.compile(
            "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+"
            + TOP_LEVEL_DOMAIN_PATTERN.pattern() + ")|"
            + IP_ADDRESS_PATTERN.pattern() + ")");

    /**
     *  Regular expression pattern to match things that look like e-mail
     *  addresses: a local part of 1 to 256 letters, digits or any of
     *  {@code + . _ % -}, an {@code @} sign, a first domain label, and one
     *  or more further dot-separated labels.  The domain is not checked
     *  against the list of top-level domains.
     */
    public static final Pattern EMAIL_ADDRESS_PATTERN
        = Pattern.compile(
            "[a-zA-Z0-9\\+\\.\\_\\%\\-]{1,256}" +
            "\\@" +
            "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
            "(" +
                "\\." +
                "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" +
            ")+"
        );

    /**
     * This pattern is intended for searching for things that look like they
     * might be phone numbers in arbitrary text, not for validating whether
     * something is in fact a phone number.  It will miss many things that
     * are legitimate phone numbers.
     *
     * <p> The pattern matches the following:
     * <ul>
     * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
     * may follow.
     * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
     * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
     * </ul>
     */
    public static final Pattern PHONE_PATTERN
        = Pattern.compile(                                  // sdd = space, dot, or dash
                "(\\+[0-9]+[\\- \\.]*)?"                    // +<digits><sdd>*
                + "(\\([0-9]+\\)[\\- \\.]*)?"               // (<digits>)<sdd>*
                + "([0-9][0-9\\- \\.][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit>

    /**
     *  Convenience method to take all of the non-null matching groups in a
     *  regex Matcher and return them as a concatenated string.  Groups are
     *  appended in index order starting at group 1; group 0 (the whole
     *  match) is not included, and groups that did not participate in the
     *  match are skipped.
     *
     *  @param matcher      The Matcher object from which grouped text will
     *                      be extracted; it must hold a successful match
     *
     *  @return             A String comprising all of the non-null matched
     *                      groups concatenated together
     */
    public static final String concatGroups(Matcher matcher) {
        StringBuilder b = new StringBuilder();
        final int numGroups = matcher.groupCount();

        for (int i = 1; i <= numGroups; i++) {
            String s = matcher.group(i);

            // A group that did not take part in the match is reported as null.
            if (s != null) {
                b.append(s);
            }
        }

        return b.toString();
    }

    /**
     * Convenience method to return only the digits and plus signs
     * in the matching string.
     *
     * @param matcher      The Matcher object from which digits and plus will
     *                     be extracted; it must hold a successful match
     *
     * @return             A String comprising all of the digits and plus in
     *                     the match
     */
    public static final String digitsAndPlusOnly(Matcher matcher) {
        StringBuilder buffer = new StringBuilder();
        String matchingRegion = matcher.group();

        for (int i = 0, size = matchingRegion.length(); i < size; i++) {
            char character = matchingRegion.charAt(i);

            if (character == '+' || Character.isDigit(character)) {
                buffer.append(character);
            }
        }
        return buffer.toString();
    }
}
205