1/* 2 * Copyright (C) 2011 The Libphonenumber Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.i18n.phonenumbers; 18 19import com.android.i18n.phonenumbers.PhoneNumberUtil.Leniency; 20import com.android.i18n.phonenumbers.Phonenumber.PhoneNumber; 21 22import java.lang.Character.UnicodeBlock; 23import java.util.Iterator; 24import java.util.NoSuchElementException; 25import java.util.regex.Matcher; 26import java.util.regex.Pattern; 27 28/** 29 * A stateful class that finds and extracts telephone numbers from {@linkplain CharSequence text}. 30 * Instances can be created using the {@linkplain PhoneNumberUtil#findNumbers factory methods} in 31 * {@link PhoneNumberUtil}. 32 * 33 * <p>Vanity numbers (phone numbers using alphabetic digits such as <tt>1-800-SIX-FLAGS</tt> are 34 * not found. 35 * 36 * <p>This class is not thread-safe. 37 * 38 * @author Tom Hofmann 39 */ 40final class PhoneNumberMatcher implements Iterator<PhoneNumberMatch> { 41 /** 42 * The phone number pattern used by {@link #find}, similar to 43 * {@code PhoneNumberUtil.VALID_PHONE_NUMBER}, but with the following differences: 44 * <ul> 45 * <li>All captures are limited in order to place an upper bound to the text matched by the 46 * pattern. 47 * <ul> 48 * <li>Leading punctuation / plus signs are limited. 49 * <li>Consecutive occurrences of punctuation are limited. 50 * <li>Number of digits is limited. 51 * </ul> 52 * <li>No whitespace is allowed at the start or end. 53 * <li>No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently supported. 54 * </ul> 55 */ 56 private static final Pattern PATTERN; 57 /** 58 * Matches strings that look like publication pages. Example: 59 * <pre>Computing Complete Answers to Queries in the Presence of Limited Access Patterns. 60 * Chen Li. VLDB J. 12(3): 211-227 (2003).</pre> 61 * 62 * The string "211-227 (2003)" is not a telephone number. 63 */ 64 private static final Pattern PUB_PAGES = Pattern.compile("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"); 65 66 /** 67 * Matches strings that look like dates using "/" as a separator. Examples: 3/10/2011, 31/10/96 or 68 * 08/31/95. 69 */ 70 private static final Pattern SLASH_SEPARATED_DATES = 71 Pattern.compile("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}"); 72 73 /** 74 * Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does not include the 75 * trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX. 76 */ 77 private static final Pattern TIME_STAMPS = 78 Pattern.compile("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$"); 79 private static final Pattern TIME_STAMPS_SUFFIX = Pattern.compile(":[0-5]\\d"); 80 81 /** 82 * Pattern to check that brackets match. Opening brackets should be closed within a phone number. 83 * This also checks that there is something inside the brackets. Having no brackets at all is also 84 * fine. 85 */ 86 private static final Pattern MATCHING_BRACKETS; 87 88 /** 89 * Matches white-space, which may indicate the end of a phone number and the start of something 90 * else (such as a neighbouring zip-code). If white-space is found, continues to match all 91 * characters that are not typically used to start a phone number. 92 */ 93 private static final Pattern GROUP_SEPARATOR; 94 95 /** 96 * Punctuation that may be at the start of a phone number - brackets and plus signs. 97 */ 98 private static final Pattern LEAD_CLASS; 99 100 static { 101 /* Builds the MATCHING_BRACKETS and PATTERN regular expressions. The building blocks below exist 102 * to make the pattern more easily understood. */ 103 104 String openingParens = "(\\[\uFF08\uFF3B"; 105 String closingParens = ")\\]\uFF09\uFF3D"; 106 String nonParens = "[^" + openingParens + closingParens + "]"; 107 108 /* Limit on the number of pairs of brackets in a phone number. */ 109 String bracketPairLimit = limit(0, 3); 110 /* 111 * An opening bracket at the beginning may not be closed, but subsequent ones should be. It's 112 * also possible that the leading bracket was dropped, so we shouldn't be surprised if we see a 113 * closing bracket first. We limit the sets of brackets in a phone number to four. 114 */ 115 MATCHING_BRACKETS = Pattern.compile( 116 "(?:[" + openingParens + "])?" + "(?:" + nonParens + "+" + "[" + closingParens + "])?" + 117 nonParens + "+" + 118 "(?:[" + openingParens + "]" + nonParens + "+[" + closingParens + "])" + bracketPairLimit + 119 nonParens + "*"); 120 121 /* Limit on the number of leading (plus) characters. */ 122 String leadLimit = limit(0, 2); 123 /* Limit on the number of consecutive punctuation characters. */ 124 String punctuationLimit = limit(0, 4); 125 /* The maximum number of digits allowed in a digit-separated block. As we allow all digits in a 126 * single block, set high enough to accommodate the entire national number and the international 127 * country code. */ 128 int digitBlockLimit = 129 PhoneNumberUtil.MAX_LENGTH_FOR_NSN + PhoneNumberUtil.MAX_LENGTH_COUNTRY_CODE; 130 /* Limit on the number of blocks separated by punctuation. Uses digitBlockLimit since some 131 * formats use spaces to separate each digit. */ 132 String blockLimit = limit(0, digitBlockLimit); 133 134 /* A punctuation sequence allowing white space. */ 135 String punctuation = "[" + PhoneNumberUtil.VALID_PUNCTUATION + "]" + punctuationLimit; 136 /* A digits block without punctuation. */ 137 String digitSequence = "\\p{Nd}" + limit(1, digitBlockLimit); 138 139 String leadClassChars = openingParens + PhoneNumberUtil.PLUS_CHARS; 140 String leadClass = "[" + leadClassChars + "]"; 141 LEAD_CLASS = Pattern.compile(leadClass); 142 GROUP_SEPARATOR = Pattern.compile("\\p{Z}" + "[^" + leadClassChars + "\\p{Nd}]*"); 143 144 /* Phone number pattern allowing optional punctuation. */ 145 PATTERN = Pattern.compile( 146 "(?:" + leadClass + punctuation + ")" + leadLimit + 147 digitSequence + "(?:" + punctuation + digitSequence + ")" + blockLimit + 148 "(?:" + PhoneNumberUtil.EXTN_PATTERNS_FOR_MATCHING + ")?", 149 PhoneNumberUtil.REGEX_FLAGS); 150 } 151 152 /** Returns a regular expression quantifier with an upper and lower limit. */ 153 private static String limit(int lower, int upper) { 154 if ((lower < 0) || (upper <= 0) || (upper < lower)) { 155 throw new IllegalArgumentException(); 156 } 157 return "{" + lower + "," + upper + "}"; 158 } 159 160 /** The potential states of a PhoneNumberMatcher. */ 161 private enum State { 162 NOT_READY, READY, DONE 163 } 164 165 /** The phone number utility. */ 166 private final PhoneNumberUtil phoneUtil; 167 /** The text searched for phone numbers. */ 168 private final CharSequence text; 169 /** 170 * The region (country) to assume for phone numbers without an international prefix, possibly 171 * null. 172 */ 173 private final String preferredRegion; 174 /** The degree of validation requested. */ 175 private final Leniency leniency; 176 /** The maximum number of retries after matching an invalid number. */ 177 private long maxTries; 178 179 /** The iteration tristate. */ 180 private State state = State.NOT_READY; 181 /** The last successful match, null unless in {@link State#READY}. */ 182 private PhoneNumberMatch lastMatch = null; 183 /** The next index to start searching at. Undefined in {@link State#DONE}. */ 184 private int searchIndex = 0; 185 186 /** 187 * Creates a new instance. See the factory methods in {@link PhoneNumberUtil} on how to obtain a 188 * new instance. 189 * 190 * @param util the phone number util to use 191 * @param text the character sequence that we will search, null for no text 192 * @param country the country to assume for phone numbers not written in international format 193 * (with a leading plus, or with the international dialing prefix of the 194 * specified region). May be null or "ZZ" if only numbers with a 195 * leading plus should be considered. 196 * @param leniency the leniency to use when evaluating candidate phone numbers 197 * @param maxTries the maximum number of invalid numbers to try before giving up on the text. 198 * This is to cover degenerate cases where the text has a lot of false positives 199 * in it. Must be {@code >= 0}. 200 */ 201 PhoneNumberMatcher(PhoneNumberUtil util, CharSequence text, String country, Leniency leniency, 202 long maxTries) { 203 204 if ((util == null) || (leniency == null)) { 205 throw new NullPointerException(); 206 } 207 if (maxTries < 0) { 208 throw new IllegalArgumentException(); 209 } 210 this.phoneUtil = util; 211 this.text = (text != null) ? text : ""; 212 this.preferredRegion = country; 213 this.leniency = leniency; 214 this.maxTries = maxTries; 215 } 216 217 public boolean hasNext() { 218 if (state == State.NOT_READY) { 219 lastMatch = find(searchIndex); 220 if (lastMatch == null) { 221 state = State.DONE; 222 } else { 223 searchIndex = lastMatch.end(); 224 state = State.READY; 225 } 226 } 227 return state == State.READY; 228 } 229 230 public PhoneNumberMatch next() { 231 // Check the state and find the next match as a side-effect if necessary. 232 if (!hasNext()) { 233 throw new NoSuchElementException(); 234 } 235 236 // Don't retain that memory any longer than necessary. 237 PhoneNumberMatch result = lastMatch; 238 lastMatch = null; 239 state = State.NOT_READY; 240 return result; 241 } 242 243 /** 244 * Attempts to find the next subsequence in the searched sequence on or after {@code searchIndex} 245 * that represents a phone number. Returns the next match, null if none was found. 246 * 247 * @param index the search index to start searching at 248 * @return the phone number match found, null if none can be found 249 */ 250 private PhoneNumberMatch find(int index) { 251 Matcher matcher = PATTERN.matcher(text); 252 while ((maxTries > 0) && matcher.find(index)) { 253 int start = matcher.start(); 254 CharSequence candidate = text.subSequence(start, matcher.end()); 255 256 // Check for extra numbers at the end. 257 // TODO: This is the place to start when trying to support extraction of multiple phone number 258 // from split notations (+41 79 123 45 67 / 68). 259 candidate = trimAfterFirstMatch(PhoneNumberUtil.SECOND_NUMBER_START_PATTERN, candidate); 260 261 PhoneNumberMatch match = extractMatch(candidate, start); 262 if (match != null) { 263 return match; 264 } 265 266 index = start + candidate.length(); 267 maxTries--; 268 } 269 270 return null; 271 } 272 273 /** 274 * Trims away any characters after the first match of {@code pattern} in {@code candidate}, 275 * returning the trimmed version. 276 */ 277 private static CharSequence trimAfterFirstMatch(Pattern pattern, CharSequence candidate) { 278 Matcher trailingCharsMatcher = pattern.matcher(candidate); 279 if (trailingCharsMatcher.find()) { 280 candidate = candidate.subSequence(0, trailingCharsMatcher.start()); 281 } 282 return candidate; 283 } 284 285 /** 286 * Helper method to determine if a character is a Latin-script letter or not. For our purposes, 287 * combining marks should also return true since we assume they have been added to a preceding 288 * Latin character. 289 */ 290 // @VisibleForTesting 291 static boolean isLatinLetter(char letter) { 292 // Combining marks are a subset of non-spacing-mark. 293 if (!Character.isLetter(letter) && Character.getType(letter) != Character.NON_SPACING_MARK) { 294 return false; 295 } 296 UnicodeBlock block = UnicodeBlock.of(letter); 297 return block.equals(UnicodeBlock.BASIC_LATIN) || 298 block.equals(UnicodeBlock.LATIN_1_SUPPLEMENT) || 299 block.equals(UnicodeBlock.LATIN_EXTENDED_A) || 300 block.equals(UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) || 301 block.equals(UnicodeBlock.LATIN_EXTENDED_B) || 302 block.equals(UnicodeBlock.COMBINING_DIACRITICAL_MARKS); 303 } 304 305 private static boolean isInvalidPunctuationSymbol(char character) { 306 return character == '%' || Character.getType(character) == Character.CURRENCY_SYMBOL; 307 } 308 309 /** 310 * Attempts to extract a match from a {@code candidate} character sequence. 311 * 312 * @param candidate the candidate text that might contain a phone number 313 * @param offset the offset of {@code candidate} within {@link #text} 314 * @return the match found, null if none can be found 315 */ 316 private PhoneNumberMatch extractMatch(CharSequence candidate, int offset) { 317 // Skip a match that is more likely a publication page reference or a date. 318 if (PUB_PAGES.matcher(candidate).find() || SLASH_SEPARATED_DATES.matcher(candidate).find()) { 319 return null; 320 } 321 // Skip potential time-stamps. 322 if (TIME_STAMPS.matcher(candidate).find()) { 323 String followingText = text.toString().substring(offset + candidate.length()); 324 if (TIME_STAMPS_SUFFIX.matcher(followingText).lookingAt()) { 325 return null; 326 } 327 } 328 329 // Try to come up with a valid match given the entire candidate. 330 String rawString = candidate.toString(); 331 PhoneNumberMatch match = parseAndVerify(rawString, offset); 332 if (match != null) { 333 return match; 334 } 335 336 // If that failed, try to find an "inner match" - there might be a phone number within this 337 // candidate. 338 return extractInnerMatch(rawString, offset); 339 } 340 341 /** 342 * Attempts to extract a match from {@code candidate} if the whole candidate does not qualify as a 343 * match. 344 * 345 * @param candidate the candidate text that might contain a phone number 346 * @param offset the current offset of {@code candidate} within {@link #text} 347 * @return the match found, null if none can be found 348 */ 349 private PhoneNumberMatch extractInnerMatch(String candidate, int offset) { 350 // Try removing either the first or last "group" in the number and see if this gives a result. 351 // We consider white space to be a possible indication of the start or end of the phone number. 352 Matcher groupMatcher = GROUP_SEPARATOR.matcher(candidate); 353 354 if (groupMatcher.find()) { 355 // Try the first group by itself. 356 CharSequence firstGroupOnly = candidate.substring(0, groupMatcher.start()); 357 firstGroupOnly = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN, 358 firstGroupOnly); 359 PhoneNumberMatch match = parseAndVerify(firstGroupOnly.toString(), offset); 360 if (match != null) { 361 return match; 362 } 363 maxTries--; 364 365 int withoutFirstGroupStart = groupMatcher.end(); 366 // Try the rest of the candidate without the first group. 367 CharSequence withoutFirstGroup = candidate.substring(withoutFirstGroupStart); 368 withoutFirstGroup = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN, 369 withoutFirstGroup); 370 match = parseAndVerify(withoutFirstGroup.toString(), offset + withoutFirstGroupStart); 371 if (match != null) { 372 return match; 373 } 374 maxTries--; 375 376 if (maxTries > 0) { 377 int lastGroupStart = withoutFirstGroupStart; 378 while (groupMatcher.find()) { 379 // Find the last group. 380 lastGroupStart = groupMatcher.start(); 381 } 382 CharSequence withoutLastGroup = candidate.substring(0, lastGroupStart); 383 withoutLastGroup = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN, 384 withoutLastGroup); 385 if (withoutLastGroup.equals(firstGroupOnly)) { 386 // If there are only two groups, then the group "without the last group" is the same as 387 // the first group. In these cases, we don't want to re-check the number group, so we exit 388 // already. 389 return null; 390 } 391 match = parseAndVerify(withoutLastGroup.toString(), offset); 392 if (match != null) { 393 return match; 394 } 395 maxTries--; 396 } 397 } 398 return null; 399 } 400 401 /** 402 * Parses a phone number from the {@code candidate} using {@link PhoneNumberUtil#parse} and 403 * verifies it matches the requested {@link #leniency}. If parsing and verification succeed, a 404 * corresponding {@link PhoneNumberMatch} is returned, otherwise this method returns null. 405 * 406 * @param candidate the candidate match 407 * @param offset the offset of {@code candidate} within {@link #text} 408 * @return the parsed and validated phone number match, or null 409 */ 410 private PhoneNumberMatch parseAndVerify(String candidate, int offset) { 411 try { 412 // Check the candidate doesn't contain any formatting which would indicate that it really 413 // isn't a phone number. 414 if (!MATCHING_BRACKETS.matcher(candidate).matches()) { 415 return null; 416 } 417 418 // If leniency is set to VALID or stricter, we also want to skip numbers that are surrounded 419 // by Latin alphabetic characters, to skip cases like abc8005001234 or 8005001234def. 420 if (leniency.compareTo(Leniency.VALID) >= 0) { 421 // If the candidate is not at the start of the text, and does not start with phone-number 422 // punctuation, check the previous character. 423 if (offset > 0 && !LEAD_CLASS.matcher(candidate).lookingAt()) { 424 char previousChar = text.charAt(offset - 1); 425 // We return null if it is a latin letter or an invalid punctuation symbol. 426 if (isInvalidPunctuationSymbol(previousChar) || isLatinLetter(previousChar)) { 427 return null; 428 } 429 } 430 int lastCharIndex = offset + candidate.length(); 431 if (lastCharIndex < text.length()) { 432 char nextChar = text.charAt(lastCharIndex); 433 if (isInvalidPunctuationSymbol(nextChar) || isLatinLetter(nextChar)) { 434 return null; 435 } 436 } 437 } 438 439 PhoneNumber number = phoneUtil.parseAndKeepRawInput(candidate, preferredRegion); 440 if (leniency.verify(number, candidate, phoneUtil)) { 441 // We used parseAndKeepRawInput to create this number, but for now we don't return the extra 442 // values parsed. TODO: stop clearing all values here and switch all users over 443 // to using rawInput() rather than the rawString() of PhoneNumberMatch. 444 number.clearCountryCodeSource(); 445 number.clearRawInput(); 446 number.clearPreferredDomesticCarrierCode(); 447 return new PhoneNumberMatch(offset, candidate, number); 448 } 449 } catch (NumberParseException e) { 450 // ignore and continue 451 } 452 return null; 453 } 454 455 /** 456 * Always throws {@link UnsupportedOperationException} as removal is not supported. 457 */ 458 public void remove() { 459 throw new UnsupportedOperationException(); 460 } 461} 462