1/*
2 * Copyright (C) 2011 The Libphonenumber Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.i18n.phonenumbers;
18
19import com.android.i18n.phonenumbers.PhoneNumberUtil.Leniency;
20import com.android.i18n.phonenumbers.Phonenumber.PhoneNumber;
21
22import java.lang.Character.UnicodeBlock;
23import java.util.Iterator;
24import java.util.NoSuchElementException;
25import java.util.regex.Matcher;
26import java.util.regex.Pattern;
27
28/**
29 * A stateful class that finds and extracts telephone numbers from {@linkplain CharSequence text}.
30 * Instances can be created using the {@linkplain PhoneNumberUtil#findNumbers factory methods} in
31 * {@link PhoneNumberUtil}.
32 *
33 * <p>Vanity numbers (phone numbers using alphabetic digits such as <tt>1-800-SIX-FLAGS</tt> are
34 * not found.
35 *
36 * <p>This class is not thread-safe.
37 *
38 * @author Tom Hofmann
39 */
40final class PhoneNumberMatcher implements Iterator<PhoneNumberMatch> {
41  /**
42   * The phone number pattern used by {@link #find}, similar to
43   * {@code PhoneNumberUtil.VALID_PHONE_NUMBER}, but with the following differences:
44   * <ul>
45   *   <li>All captures are limited in order to place an upper bound to the text matched by the
46   *       pattern.
47   * <ul>
48   *   <li>Leading punctuation / plus signs are limited.
49   *   <li>Consecutive occurrences of punctuation are limited.
50   *   <li>Number of digits is limited.
51   * </ul>
52   *   <li>No whitespace is allowed at the start or end.
53   *   <li>No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently supported.
54   * </ul>
55   */
56  private static final Pattern PATTERN;
57  /**
58   * Matches strings that look like publication pages. Example:
59   * <pre>Computing Complete Answers to Queries in the Presence of Limited Access Patterns.
60   * Chen Li. VLDB J. 12(3): 211-227 (2003).</pre>
61   *
62   * The string "211-227 (2003)" is not a telephone number.
63   */
64  private static final Pattern PUB_PAGES = Pattern.compile("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}");
65
66  /**
67   * Matches strings that look like dates using "/" as a separator. Examples: 3/10/2011, 31/10/96 or
68   * 08/31/95.
69   */
70  private static final Pattern SLASH_SEPARATED_DATES =
71      Pattern.compile("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}");
72
73  /**
74   * Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does not include the
75   * trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX.
76   */
77  private static final Pattern TIME_STAMPS =
78      Pattern.compile("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$");
79  private static final Pattern TIME_STAMPS_SUFFIX = Pattern.compile(":[0-5]\\d");
80
81  /**
82   * Pattern to check that brackets match. Opening brackets should be closed within a phone number.
83   * This also checks that there is something inside the brackets. Having no brackets at all is also
84   * fine.
85   */
86  private static final Pattern MATCHING_BRACKETS;
87
88  /**
89   * Matches white-space, which may indicate the end of a phone number and the start of something
90   * else (such as a neighbouring zip-code). If white-space is found, continues to match all
91   * characters that are not typically used to start a phone number.
92   */
93  private static final Pattern GROUP_SEPARATOR;
94
95  /**
96   * Punctuation that may be at the start of a phone number - brackets and plus signs.
97   */
98  private static final Pattern LEAD_CLASS;
99
100  static {
101    /* Builds the MATCHING_BRACKETS and PATTERN regular expressions. The building blocks below exist
102     * to make the pattern more easily understood. */
103
104    String openingParens = "(\\[\uFF08\uFF3B";
105    String closingParens = ")\\]\uFF09\uFF3D";
106    String nonParens = "[^" + openingParens + closingParens + "]";
107
108    /* Limit on the number of pairs of brackets in a phone number. */
109    String bracketPairLimit = limit(0, 3);
110    /*
111     * An opening bracket at the beginning may not be closed, but subsequent ones should be.  It's
112     * also possible that the leading bracket was dropped, so we shouldn't be surprised if we see a
113     * closing bracket first. We limit the sets of brackets in a phone number to four.
114     */
115    MATCHING_BRACKETS = Pattern.compile(
116        "(?:[" + openingParens + "])?" + "(?:" + nonParens + "+" + "[" + closingParens + "])?" +
117        nonParens + "+" +
118        "(?:[" + openingParens + "]" + nonParens + "+[" + closingParens + "])" + bracketPairLimit +
119        nonParens + "*");
120
121    /* Limit on the number of leading (plus) characters. */
122    String leadLimit = limit(0, 2);
123    /* Limit on the number of consecutive punctuation characters. */
124    String punctuationLimit = limit(0, 4);
125    /* The maximum number of digits allowed in a digit-separated block. As we allow all digits in a
126     * single block, set high enough to accommodate the entire national number and the international
127     * country code. */
128    int digitBlockLimit =
129        PhoneNumberUtil.MAX_LENGTH_FOR_NSN + PhoneNumberUtil.MAX_LENGTH_COUNTRY_CODE;
130    /* Limit on the number of blocks separated by punctuation. Uses digitBlockLimit since some
131     * formats use spaces to separate each digit. */
132    String blockLimit = limit(0, digitBlockLimit);
133
134    /* A punctuation sequence allowing white space. */
135    String punctuation = "[" + PhoneNumberUtil.VALID_PUNCTUATION + "]" + punctuationLimit;
136    /* A digits block without punctuation. */
137    String digitSequence = "\\p{Nd}" + limit(1, digitBlockLimit);
138
139    String leadClassChars = openingParens + PhoneNumberUtil.PLUS_CHARS;
140    String leadClass = "[" + leadClassChars + "]";
141    LEAD_CLASS = Pattern.compile(leadClass);
142    GROUP_SEPARATOR = Pattern.compile("\\p{Z}" + "[^" + leadClassChars  + "\\p{Nd}]*");
143
144    /* Phone number pattern allowing optional punctuation. */
145    PATTERN = Pattern.compile(
146        "(?:" + leadClass + punctuation + ")" + leadLimit +
147        digitSequence + "(?:" + punctuation + digitSequence + ")" + blockLimit +
148        "(?:" + PhoneNumberUtil.EXTN_PATTERNS_FOR_MATCHING + ")?",
149        PhoneNumberUtil.REGEX_FLAGS);
150  }
151
152  /** Returns a regular expression quantifier with an upper and lower limit. */
153  private static String limit(int lower, int upper) {
154    if ((lower < 0) || (upper <= 0) || (upper < lower)) {
155      throw new IllegalArgumentException();
156    }
157    return "{" + lower + "," + upper + "}";
158  }
159
160  /** The potential states of a PhoneNumberMatcher. */
161  private enum State {
162    NOT_READY, READY, DONE
163  }
164
165  /** The phone number utility. */
166  private final PhoneNumberUtil phoneUtil;
167  /** The text searched for phone numbers. */
168  private final CharSequence text;
169  /**
170   * The region (country) to assume for phone numbers without an international prefix, possibly
171   * null.
172   */
173  private final String preferredRegion;
174  /** The degree of validation requested. */
175  private final Leniency leniency;
176  /** The maximum number of retries after matching an invalid number. */
177  private long maxTries;
178
179  /** The iteration tristate. */
180  private State state = State.NOT_READY;
181  /** The last successful match, null unless in {@link State#READY}. */
182  private PhoneNumberMatch lastMatch = null;
183  /** The next index to start searching at. Undefined in {@link State#DONE}. */
184  private int searchIndex = 0;
185
186  /**
187   * Creates a new instance. See the factory methods in {@link PhoneNumberUtil} on how to obtain a
188   * new instance.
189   *
190   * @param util      the phone number util to use
191   * @param text      the character sequence that we will search, null for no text
192   * @param country   the country to assume for phone numbers not written in international format
193   *                  (with a leading plus, or with the international dialing prefix of the
194   *                  specified region). May be null or "ZZ" if only numbers with a
195   *                  leading plus should be considered.
196   * @param leniency  the leniency to use when evaluating candidate phone numbers
197   * @param maxTries  the maximum number of invalid numbers to try before giving up on the text.
198   *                  This is to cover degenerate cases where the text has a lot of false positives
199   *                  in it. Must be {@code >= 0}.
200   */
201  PhoneNumberMatcher(PhoneNumberUtil util, CharSequence text, String country, Leniency leniency,
202      long maxTries) {
203
204    if ((util == null) || (leniency == null)) {
205      throw new NullPointerException();
206    }
207    if (maxTries < 0) {
208      throw new IllegalArgumentException();
209    }
210    this.phoneUtil = util;
211    this.text = (text != null) ? text : "";
212    this.preferredRegion = country;
213    this.leniency = leniency;
214    this.maxTries = maxTries;
215  }
216
217  public boolean hasNext() {
218    if (state == State.NOT_READY) {
219      lastMatch = find(searchIndex);
220      if (lastMatch == null) {
221        state = State.DONE;
222      } else {
223        searchIndex = lastMatch.end();
224        state = State.READY;
225      }
226    }
227    return state == State.READY;
228  }
229
230  public PhoneNumberMatch next() {
231    // Check the state and find the next match as a side-effect if necessary.
232    if (!hasNext()) {
233      throw new NoSuchElementException();
234    }
235
236    // Don't retain that memory any longer than necessary.
237    PhoneNumberMatch result = lastMatch;
238    lastMatch = null;
239    state = State.NOT_READY;
240    return result;
241  }
242
243  /**
244   * Attempts to find the next subsequence in the searched sequence on or after {@code searchIndex}
245   * that represents a phone number. Returns the next match, null if none was found.
246   *
247   * @param index  the search index to start searching at
248   * @return  the phone number match found, null if none can be found
249   */
250  private PhoneNumberMatch find(int index) {
251    Matcher matcher = PATTERN.matcher(text);
252    while ((maxTries > 0) && matcher.find(index)) {
253      int start = matcher.start();
254      CharSequence candidate = text.subSequence(start, matcher.end());
255
256      // Check for extra numbers at the end.
257      // TODO: This is the place to start when trying to support extraction of multiple phone number
258      // from split notations (+41 79 123 45 67 / 68).
259      candidate = trimAfterFirstMatch(PhoneNumberUtil.SECOND_NUMBER_START_PATTERN, candidate);
260
261      PhoneNumberMatch match = extractMatch(candidate, start);
262      if (match != null) {
263        return match;
264      }
265
266      index = start + candidate.length();
267      maxTries--;
268    }
269
270    return null;
271  }
272
273  /**
274   * Trims away any characters after the first match of {@code pattern} in {@code candidate},
275   * returning the trimmed version.
276   */
277  private static CharSequence trimAfterFirstMatch(Pattern pattern, CharSequence candidate) {
278    Matcher trailingCharsMatcher = pattern.matcher(candidate);
279    if (trailingCharsMatcher.find()) {
280      candidate = candidate.subSequence(0, trailingCharsMatcher.start());
281    }
282    return candidate;
283  }
284
285  /**
286   * Helper method to determine if a character is a Latin-script letter or not. For our purposes,
287   * combining marks should also return true since we assume they have been added to a preceding
288   * Latin character.
289   */
290  // @VisibleForTesting
291  static boolean isLatinLetter(char letter) {
292    // Combining marks are a subset of non-spacing-mark.
293    if (!Character.isLetter(letter) && Character.getType(letter) != Character.NON_SPACING_MARK) {
294      return false;
295    }
296    UnicodeBlock block = UnicodeBlock.of(letter);
297    return block.equals(UnicodeBlock.BASIC_LATIN) ||
298        block.equals(UnicodeBlock.LATIN_1_SUPPLEMENT) ||
299        block.equals(UnicodeBlock.LATIN_EXTENDED_A) ||
300        block.equals(UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) ||
301        block.equals(UnicodeBlock.LATIN_EXTENDED_B) ||
302        block.equals(UnicodeBlock.COMBINING_DIACRITICAL_MARKS);
303  }
304
305  private static boolean isInvalidPunctuationSymbol(char character) {
306    return character == '%' || Character.getType(character) == Character.CURRENCY_SYMBOL;
307  }
308
309  /**
310   * Attempts to extract a match from a {@code candidate} character sequence.
311   *
312   * @param candidate  the candidate text that might contain a phone number
313   * @param offset  the offset of {@code candidate} within {@link #text}
314   * @return  the match found, null if none can be found
315   */
316  private PhoneNumberMatch extractMatch(CharSequence candidate, int offset) {
317    // Skip a match that is more likely a publication page reference or a date.
318    if (PUB_PAGES.matcher(candidate).find() || SLASH_SEPARATED_DATES.matcher(candidate).find()) {
319      return null;
320    }
321    // Skip potential time-stamps.
322    if (TIME_STAMPS.matcher(candidate).find()) {
323      String followingText = text.toString().substring(offset + candidate.length());
324      if (TIME_STAMPS_SUFFIX.matcher(followingText).lookingAt()) {
325        return null;
326      }
327    }
328
329    // Try to come up with a valid match given the entire candidate.
330    String rawString = candidate.toString();
331    PhoneNumberMatch match = parseAndVerify(rawString, offset);
332    if (match != null) {
333      return match;
334    }
335
336    // If that failed, try to find an "inner match" - there might be a phone number within this
337    // candidate.
338    return extractInnerMatch(rawString, offset);
339  }
340
341  /**
342   * Attempts to extract a match from {@code candidate} if the whole candidate does not qualify as a
343   * match.
344   *
345   * @param candidate  the candidate text that might contain a phone number
346   * @param offset  the current offset of {@code candidate} within {@link #text}
347   * @return  the match found, null if none can be found
348   */
349  private PhoneNumberMatch extractInnerMatch(String candidate, int offset) {
350    // Try removing either the first or last "group" in the number and see if this gives a result.
351    // We consider white space to be a possible indication of the start or end of the phone number.
352    Matcher groupMatcher = GROUP_SEPARATOR.matcher(candidate);
353
354    if (groupMatcher.find()) {
355      // Try the first group by itself.
356      CharSequence firstGroupOnly = candidate.substring(0, groupMatcher.start());
357      firstGroupOnly = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN,
358                                           firstGroupOnly);
359      PhoneNumberMatch match = parseAndVerify(firstGroupOnly.toString(), offset);
360      if (match != null) {
361        return match;
362      }
363      maxTries--;
364
365      int withoutFirstGroupStart = groupMatcher.end();
366      // Try the rest of the candidate without the first group.
367      CharSequence withoutFirstGroup = candidate.substring(withoutFirstGroupStart);
368      withoutFirstGroup = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN,
369                                              withoutFirstGroup);
370      match = parseAndVerify(withoutFirstGroup.toString(), offset + withoutFirstGroupStart);
371      if (match != null) {
372        return match;
373      }
374      maxTries--;
375
376      if (maxTries > 0) {
377        int lastGroupStart = withoutFirstGroupStart;
378        while (groupMatcher.find()) {
379          // Find the last group.
380          lastGroupStart = groupMatcher.start();
381        }
382        CharSequence withoutLastGroup = candidate.substring(0, lastGroupStart);
383        withoutLastGroup = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN,
384                                               withoutLastGroup);
385        if (withoutLastGroup.equals(firstGroupOnly)) {
386          // If there are only two groups, then the group "without the last group" is the same as
387          // the first group. In these cases, we don't want to re-check the number group, so we exit
388          // already.
389          return null;
390        }
391        match = parseAndVerify(withoutLastGroup.toString(), offset);
392        if (match != null) {
393          return match;
394        }
395        maxTries--;
396      }
397    }
398    return null;
399  }
400
401  /**
402   * Parses a phone number from the {@code candidate} using {@link PhoneNumberUtil#parse} and
403   * verifies it matches the requested {@link #leniency}. If parsing and verification succeed, a
404   * corresponding {@link PhoneNumberMatch} is returned, otherwise this method returns null.
405   *
406   * @param candidate  the candidate match
407   * @param offset  the offset of {@code candidate} within {@link #text}
408   * @return  the parsed and validated phone number match, or null
409   */
410  private PhoneNumberMatch parseAndVerify(String candidate, int offset) {
411    try {
412      // Check the candidate doesn't contain any formatting which would indicate that it really
413      // isn't a phone number.
414      if (!MATCHING_BRACKETS.matcher(candidate).matches()) {
415        return null;
416      }
417
418      // If leniency is set to VALID or stricter, we also want to skip numbers that are surrounded
419      // by Latin alphabetic characters, to skip cases like abc8005001234 or 8005001234def.
420      if (leniency.compareTo(Leniency.VALID) >= 0) {
421        // If the candidate is not at the start of the text, and does not start with phone-number
422        // punctuation, check the previous character.
423        if (offset > 0 && !LEAD_CLASS.matcher(candidate).lookingAt()) {
424          char previousChar = text.charAt(offset - 1);
425          // We return null if it is a latin letter or an invalid punctuation symbol.
426          if (isInvalidPunctuationSymbol(previousChar) || isLatinLetter(previousChar)) {
427            return null;
428          }
429        }
430        int lastCharIndex = offset + candidate.length();
431        if (lastCharIndex < text.length()) {
432          char nextChar = text.charAt(lastCharIndex);
433          if (isInvalidPunctuationSymbol(nextChar) || isLatinLetter(nextChar)) {
434            return null;
435          }
436        }
437      }
438
439      PhoneNumber number = phoneUtil.parseAndKeepRawInput(candidate, preferredRegion);
440      if (leniency.verify(number, candidate, phoneUtil)) {
441        // We used parseAndKeepRawInput to create this number, but for now we don't return the extra
442        // values parsed. TODO: stop clearing all values here and switch all users over
443        // to using rawInput() rather than the rawString() of PhoneNumberMatch.
444        number.clearCountryCodeSource();
445        number.clearRawInput();
446        number.clearPreferredDomesticCarrierCode();
447        return new PhoneNumberMatch(offset, candidate, number);
448      }
449    } catch (NumberParseException e) {
450      // ignore and continue
451    }
452    return null;
453  }
454
455  /**
456   * Always throws {@link UnsupportedOperationException} as removal is not supported.
457   */
458  public void remove() {
459    throw new UnsupportedOperationException();
460  }
461}
462