1/* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package android.text.util; 18 19import java.util.regex.Matcher; 20import java.util.regex.Pattern; 21 22/** 23 * @hide 24 */ 25public class Regex { 26 /** 27 * Regular expression pattern to match all IANA top-level domains. 28 * List accurate as of 2007/06/15. List taken from: 29 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt 30 * This pattern is auto-generated by //device/tools/make-iana-tld-pattern.py 31 */ 32 public static final Pattern TOP_LEVEL_DOMAIN_PATTERN 33 = Pattern.compile( 34 "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" 35 + "|(biz|b[abdefghijmnorstvwyz])" 36 + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" 37 + "|d[ejkmoz]" 38 + "|(edu|e[cegrstu])" 39 + "|f[ijkmor]" 40 + "|(gov|g[abdefghilmnpqrstuwy])" 41 + "|h[kmnrtu]" 42 + "|(info|int|i[delmnoqrst])" 43 + "|(jobs|j[emop])" 44 + "|k[eghimnrwyz]" 45 + "|l[abcikrstuvy]" 46 + "|(mil|mobi|museum|m[acdghklmnopqrstuvwxyz])" 47 + "|(name|net|n[acefgilopruz])" 48 + "|(org|om)" 49 + "|(pro|p[aefghklmnrstwy])" 50 + "|qa" 51 + "|r[eouw]" 52 + "|s[abcdeghijklmnortuvyz]" 53 + "|(tel|travel|t[cdfghjklmnoprtvwz])" 54 + "|u[agkmsyz]" 55 + "|v[aceginu]" 56 + "|w[fs]" 57 + "|y[etu]" 58 + "|z[amw])"); 59 60 /** 61 * Regular expression pattern to match RFC 1738 URLs 62 * List accurate as of 2007/06/15. List taken from: 63 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt 64 * This pattern is auto-generated by //device/tools/make-iana-tld-pattern.py 65 */ 66 public static final Pattern WEB_URL_PATTERN 67 = Pattern.compile( 68 "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" 69 + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" 70 + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" 71 + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host 72 + "(?:" // plus top level domain 73 + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" 74 + "|(?:biz|b[abdefghijmnorstvwyz])" 75 + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" 76 + "|d[ejkmoz]" 77 + "|(?:edu|e[cegrstu])" 78 + "|f[ijkmor]" 79 + "|(?:gov|g[abdefghilmnpqrstuwy])" 80 + "|h[kmnrtu]" 81 + "|(?:info|int|i[delmnoqrst])" 82 + "|(?:jobs|j[emop])" 83 + "|k[eghimnrwyz]" 84 + "|l[abcikrstuvy]" 85 + "|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])" 86 + "|(?:name|net|n[acefgilopruz])" 87 + "|(?:org|om)" 88 + "|(?:pro|p[aefghklmnrstwy])" 89 + "|qa" 90 + "|r[eouw]" 91 + "|s[abcdeghijklmnortuvyz]" 92 + "|(?:tel|travel|t[cdfghjklmnoprtvwz])" 93 + "|u[agkmsyz]" 94 + "|v[aceginu]" 95 + "|w[fs]" 96 + "|y[etu]" 97 + "|z[amw]))" 98 + "|(?:(?:25[0-5]|2[0-4]" // or ip address 99 + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" 100 + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" 101 + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" 102 + "|[1-9][0-9]|[0-9])))" 103 + "(?:\\:\\d{1,5})?)" // plus option port number 104 + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params 105 + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" 106 + "(?:\\b|$)"); // and finally, a word boundary or end of 107 // input. This is to stop foo.sure from 108 // matching as foo.su 109 110 public static final Pattern IP_ADDRESS_PATTERN 111 = Pattern.compile( 112 "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]" 113 + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]" 114 + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" 115 + "|[1-9][0-9]|[0-9]))"); 116 117 public static final Pattern DOMAIN_NAME_PATTERN 118 = Pattern.compile( 119 "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+" 120 + TOP_LEVEL_DOMAIN_PATTERN + ")|" 121 + IP_ADDRESS_PATTERN + ")"); 122 123 public static final Pattern EMAIL_ADDRESS_PATTERN 124 = Pattern.compile( 125 "[a-zA-Z0-9\\+\\.\\_\\%\\-]{1,256}" + 126 "\\@" + 127 "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" + 128 "(" + 129 "\\." + 130 "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" + 131 ")+" 132 ); 133 134 /** 135 * This pattern is intended for searching for things that look like they 136 * might be phone numbers in arbitrary text, not for validating whether 137 * something is in fact a phone number. It will miss many things that 138 * are legitimate phone numbers. 139 * 140 * <p> The pattern matches the following: 141 * <ul> 142 * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes 143 * may follow. 144 * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes. 145 * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes. 146 * </ul> 147 */ 148 public static final Pattern PHONE_PATTERN 149 = Pattern.compile( // sdd = space, dot, or dash 150 "(\\+[0-9]+[\\- \\.]*)?" // +<digits><sdd>* 151 + "(\\([0-9]+\\)[\\- \\.]*)?" // (<digits>)<sdd>* 152 + "([0-9][0-9\\- \\.][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit> 153 154 /** 155 * Convenience method to take all of the non-null matching groups in a 156 * regex Matcher and return them as a concatenated string. 157 * 158 * @param matcher The Matcher object from which grouped text will 159 * be extracted 160 * 161 * @return A String comprising all of the non-null matched 162 * groups concatenated together 163 */ 164 public static final String concatGroups(Matcher matcher) { 165 StringBuilder b = new StringBuilder(); 166 final int numGroups = matcher.groupCount(); 167 168 for (int i = 1; i <= numGroups; i++) { 169 String s = matcher.group(i); 170 171 System.err.println("Group(" + i + ") : " + s); 172 173 if (s != null) { 174 b.append(s); 175 } 176 } 177 178 return b.toString(); 179 } 180 181 /** 182 * Convenience method to return only the digits and plus signs 183 * in the matching string. 184 * 185 * @param matcher The Matcher object from which digits and plus will 186 * be extracted 187 * 188 * @return A String comprising all of the digits and plus in 189 * the match 190 */ 191 public static final String digitsAndPlusOnly(Matcher matcher) { 192 StringBuilder buffer = new StringBuilder(); 193 String matchingRegion = matcher.group(); 194 195 for (int i = 0, size = matchingRegion.length(); i < size; i++) { 196 char character = matchingRegion.charAt(i); 197 198 if (character == '+' || Character.isDigit(character)) { 199 buffer.append(character); 200 } 201 } 202 return buffer.toString(); 203 } 204} 205