// Copyright (C) 2011 The Libphonenumber Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: Lara Rennie
// Author: Tao Huang
//
// Implementation of a stateful class that finds and extracts telephone numbers
// from text.
206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumbermatcher.h"
226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
23fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#error phonenumbermatcher depends on ICU \
25fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com    (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#endif  // I18N_PHONENUMBERS_USE_ICU_REGEXP
276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
28603e7e5f83aad6e45e8d794c604a546936b77a16philip.liard@gmail.com#include <ctype.h>
297e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com#include <stddef.h>
306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include <limits>
31af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include <map>
326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include <string>
3335bd393fb78215a9c6dbeb158913def01eb58985lararennie@google.com#include <utility>
34603e7e5f83aad6e45e8d794c604a546936b77a16philip.liard@gmail.com#include <vector>
35603e7e5f83aad6e45e8d794c604a546936b77a16philip.liard@gmail.com
366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include <unicode/uchar.h>
376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
38b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com#include "phonenumbers/alternate_format.h"
39af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include "phonenumbers/base/logging.h"
40af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include "phonenumbers/base/memory/scoped_ptr.h"
41af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include "phonenumbers/base/memory/singleton.h"
42e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com#include "phonenumbers/callback.h"
436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/default_logger.h"
446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/encoding_utils.h"
456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/normalize_utf8.h"
461fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com#include "phonenumbers/phonemetadata.pb.h"
476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumber.pb.h"
486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumbermatch.h"
496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumberutil.h"
506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/regexp_adapter.h"
516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/regexp_adapter_icu.h"
526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/stringutil.h"
536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
54fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#ifdef I18N_PHONENUMBERS_USE_RE2
556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/regexp_adapter_re2.h"
#endif  // I18N_PHONENUMBERS_USE_RE2
576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
58b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.comusing std::make_pair;
59b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.comusing std::map;
606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comusing std::numeric_limits;
616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comusing std::string;
626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comusing std::vector;
636a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comnamespace i18n {
656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comnamespace phonenumbers {
666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comnamespace {
686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// Returns a regular expression quantifier with an upper and lower limit.
696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comstring Limit(int lower, int upper) {
706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK_GE(lower, 0);
716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK_GT(upper, 0);
726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK_LT(lower, upper);
736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return StrCat("{", lower, ",", upper, "}");
746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
7686929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.combool IsInvalidPunctuationSymbol(char32 character) {
7786929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com  return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                             const PhoneNumberUtil& util) {
826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // The characters 'x' and 'X' can be (1) a carrier code, in which case they
836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // always precede the national significant number or (2) an extension sign,
846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // in which case they always precede the extension number. We assume a
856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // carrier code is more than 1 digit, so the first case has to have more than
866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
876a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // 'x' or 'X'.
886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  size_t found;
896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  found = candidate.find_first_of("xX");
906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // We ignore the character if 'x' or 'X' appears as the last character of
916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // the string.
926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  while (found != string::npos && found < candidate.length() - 1) {
936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    // We only look for 'x' or 'X' in ASCII form.
946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    char next_char = candidate[found + 1];
956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (next_char == 'x' || next_char == 'X') {
966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      // This is the carrier code case, in which the 'X's always precede the
976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      // national significant number.
986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      ++found;
996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      if (util.IsNumberMatchWithOneString(
1006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com              number, candidate.substr(found, candidate.length() - found))
1016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com          != PhoneNumberUtil::NSN_MATCH) {
1026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
1036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
1046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    } else {
1056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      string normalized_extension(candidate.substr(found,
1066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                                   candidate.length() - found));
1076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      util.NormalizeDigitsOnly(&normalized_extension);
1086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      if (normalized_extension != number.extension()) {
1096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
1106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
1116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
1126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    found = candidate.find_first_of("xX", found + 1);
1136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
1146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return true;
1156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
116e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com
117e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.combool AllNumberGroupsRemainGrouped(
118e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const PhoneNumberUtil& util,
119e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const PhoneNumber& phone_number,
120e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const string& normalized_candidate,
121e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const vector<string>& formatted_number_groups) {
122e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  size_t from_index = 0;
123e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // Check each group of consecutive digits are not broken into separate
124e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // groupings in the normalized_candidate string.
125e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
126e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // Fails if the substring of normalized_candidate starting from from_index
127e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // doesn't contain the consecutive digits in formatted_number_groups.at(i).
128e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    from_index = normalized_candidate.find(formatted_number_groups.at(i),
129e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                           from_index);
130e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    if (from_index == string::npos) {
131e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      return false;
132e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    }
133e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // Moves from_index forward.
134e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    from_index += formatted_number_groups.at(i).length();
135e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    if (i == 0 && from_index < normalized_candidate.length()) {
136b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // We are at the position right after the NDC. We get the region used for
137b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // formatting information based on the country code in the phone number,
138b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // rather than the number itself, as we do not need to distinguish between
139b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // different countries with the same country calling code and this is
140b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // faster.
141b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      string region;
142b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      util.GetRegionCodeForCountryCode(phone_number.country_code(), &region);
143b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      string ndd_prefix;
144b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      util.GetNddPrefixForRegion(region, true, &ndd_prefix);
145b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // Note although normalized_candidate might contain non-ASCII formatting
146b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // characters, they won't be treated as ASCII digits when converted to a
147b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      // char.
148b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com      if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) {
149e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        // This means there is no formatting symbol after the NDC. In this case,
150e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        // we only accept the number if there is no formatting symbol at all in
151b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com        // the number, except for extensions. This is only important for
152b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com        // countries with national prefixes.
153e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        string national_significant_number;
154e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        util.GetNationalSignificantNumber(
155e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com            phone_number, &national_significant_number);
156e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        return HasPrefixString(normalized_candidate.substr(
157e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com            from_index - formatted_number_groups.at(i).length()),
158e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com            national_significant_number);
159e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        }
160e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      }
161e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    }
162e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // The check here makes sure that we haven't mistakenly already used the
163e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // extension to match the last group of the subscriber number. Note the
164e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // extension cannot have formatting in-between digits.
165e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    return normalized_candidate.substr(from_index)
166e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        .find(phone_number.extension()) != string::npos;
167e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com}
168b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com
169b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.combool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
170de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com#if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
171b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  if (!alternate_formats->ParseFromArray(alternate_format_get(),
172b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com                                         alternate_format_size())) {
1738d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com    LOG(ERROR) << "Could not parse binary data.";
174b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    return false;
175b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  }
176b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  return true;
177de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com#else
178de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com  return false;
179de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com#endif
180b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com}
181de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com
1826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}  // namespace
1836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
1846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comclass PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
1856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com private:
186fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com  friend class Singleton<PhoneNumberMatcherRegExps>;
187fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com
1886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string opening_parens_;
1896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string closing_parens_;
1906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string non_parens_;
1916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Limit on the number of pairs of brackets in a phone number.
1926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string bracket_pair_limit_;
1936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Helper strings for the matching_brackets_ pattern.
1946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // An opening bracket at the beginning may not be closed, but subsequent ones
1956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // should be. It's also possible that the leading bracket was dropped, so we
1966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // shouldn't be surprised if we see a closing bracket first.
1976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string leading_maybe_matched_bracket_;
1986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string bracket_pairs_;
1996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Limit on the number of leading (plus) characters.
2006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string lead_limit_;
2016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Limit on the number of consecutive punctuation characters.
2026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string punctuation_limit_;
2036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // The maximum number of digits allowed in a digit-separated block. As we
2046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // allow all digits in a single block, this should be set high enough to
2056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // accommodate the entire national number and the international country code.
2066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  int digit_block_limit_;
2076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Limit on the number of blocks separated by punctuation. Uses
2086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // kDigitBlockLimit since some formats use spaces to separate each digit.
2096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string block_limit_;
2106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // A punctuation sequence allowing white space.
2116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string punctuation_;
2126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // A digits block without punctuation.
2136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string digit_sequence_;
2146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Punctuation that may be at the start of a phone number - brackets and plus
2156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // signs.
2166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string lead_class_chars_;
2176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Same as lead_class_chars_, but enclosed as a character class.
2186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string lead_class_;
2196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Extra helper strings that form part of pattern_. These are stored
2206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // separately since StrCat has a limit of 12 args.
2216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string opening_punctuation_;
2226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string optional_extn_pattern_;
2236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
2246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com public:
2256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // We use two different reg-ex factories here for performance reasons. RE2 is
2266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // much faster for smaller reg-ex patterns, but the main pattern cannot be
2276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // handled by RE2 in an efficient way.
2286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
2296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;
2306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
2316a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Matches strings that look like publication pages. Example:
2326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Computing Complete Answers to Queries in the Presence of Limited Access
2336a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
2346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  //
2356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // The string "211-227 (2003)" is not a telephone number.
2366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> pub_pages_;
2376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Matches strings that look like dates using "/" as a separator. Examples:
2386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // 3/10/2011, 31/10/96 or 08/31/95.
2396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> slash_separated_dates_;
240cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
241cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
242cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com  scoped_ptr<const RegExp> time_stamps_;
243cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com  scoped_ptr<const RegExp> time_stamps_suffix_;
2446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Pattern to check that brackets match. Opening brackets should be closed
2456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // within a phone number. This also checks that there is something inside the
2466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // brackets. Having no brackets at all is also fine.
2476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> matching_brackets_;
2486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Matches white-space, which may indicate the end of a phone number and the
2496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // start of something else (such as a neighbouring zip-code). If white-space
2506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // is found, continues to match all characters that are not typically used to
2516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // start a phone number.
2526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> group_separator_;
2536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
2546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
2556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Compiled reg-ex representing lead_class_;
2566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> lead_class_pattern_;
2576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Phone number pattern allowing optional punctuation.
2586a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<const RegExp> pattern_;
2596a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
2606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  PhoneNumberMatcherRegExps()
2616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */),
2626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */),
2636a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
2646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        bracket_pair_limit_(Limit(0, 3)),
2656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        leading_maybe_matched_bracket_(StrCat(
2666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "(?:[", opening_parens_, "])?",
2676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "(?:", non_parens_, "+[", closing_parens_, "])?")),
2686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        bracket_pairs_(StrCat(
2696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "(?:[", opening_parens_, "]", non_parens_, "+",
2706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "[", closing_parens_, "])", bracket_pair_limit_)),
2716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        lead_limit_(Limit(0, 2)),
2726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        punctuation_limit_(Limit(0, 4)),
2736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
2746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                           PhoneNumberUtil::kMaxLengthCountryCode),
2756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        block_limit_(Limit(0, digit_block_limit_)),
2766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
2776a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                            punctuation_limit_)),
2786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
2796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
2806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        lead_class_(StrCat("[", lead_class_chars_, "]")),
2816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
2826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        optional_extn_pattern_(StrCat(
2836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "(?i)(?:",
2846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
2856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            ")?")),
2866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        regexp_factory_for_pattern_(new ICURegExpFactory()),
287fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#ifdef I18N_PHONENUMBERS_USE_RE2
2886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        regexp_factory_(new RE2RegExpFactory()),
2896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#else
2906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        regexp_factory_(new ICURegExpFactory()),
291fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#endif  // I18N_PHONENUMBERS_USE_RE2
2926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        pub_pages_(regexp_factory_->CreateRegExp(
2936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
2946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        slash_separated_dates_(regexp_factory_->CreateRegExp(
2956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "(?:(?:[0-3]?\\d/[01]?\\d)|"
2966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
297cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com        time_stamps_(regexp_factory_->CreateRegExp(
298cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")),
299cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
3006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        matching_brackets_(regexp_factory_->CreateRegExp(
3016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
3026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                   bracket_pairs_, non_parens_, "*"))),
3036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        group_separator_(regexp_factory_->CreateRegExp(
3046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
3056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        capture_up_to_second_number_start_pattern_(
3066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            regexp_factory_->CreateRegExp(
3076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
3086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        capturing_ascii_digits_pattern_(
3096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            regexp_factory_->CreateRegExp("(\\d+)")),
3106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
3116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        pattern_(regexp_factory_for_pattern_->CreateRegExp(
3126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com            StrCat("(", opening_punctuation_, lead_limit_,
3136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                   digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
3146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                   block_limit_, optional_extn_pattern_, ")"))) {
3156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
3166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
3176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com private:
3186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
3196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com};
3206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
321b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.comclass AlternateFormats : public Singleton<AlternateFormats> {
322b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com public:
323b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  PhoneMetadataCollection format_data_;
324b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com
325b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
326b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com
327b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  AlternateFormats()
328b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      : format_data_(),
329b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com        calling_code_to_alternate_formats_map_() {
330b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    if (!LoadAlternateFormats(&format_data_)) {
331b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      LOG(DFATAL) << "Could not parse compiled-in metadata.";
332b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      return;
333b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    }
334b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
335b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com             format_data_.metadata().begin();
336b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com         it != format_data_.metadata().end();
337b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com         ++it) {
338b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      calling_code_to_alternate_formats_map_.insert(
339b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com          make_pair(it->country_code(), &*it));
340b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    }
341b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  }
342b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com
343b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
344b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      const {
345b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    map<int, const PhoneMetadata*>::const_iterator it =
346b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com        calling_code_to_alternate_formats_map_.find(country_calling_code);
347b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    if (it != calling_code_to_alternate_formats_map_.end()) {
348b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      return it->second;
349b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    }
350b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    return NULL;
351b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  }
352b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com
353b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com private:
354b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
355b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com};
356b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com
3576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comPhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
3586a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                       const string& text,
3596a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                       const string& region_code,
3606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                       PhoneNumberMatcher::Leniency leniency,
3616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                       int max_tries)
3626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
363b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      alternate_formats_(AlternateFormats::GetInstance()),
3646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      phone_util_(util),
3656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      text_(text),
3666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      preferred_region_(region_code),
3676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      leniency_(leniency),
3686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      max_tries_(max_tries),
3696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      state_(NOT_READY),
3706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      last_match_(NULL),
3716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      search_index_(0) {
3726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
3736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
3746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comPhoneNumberMatcher::PhoneNumberMatcher(const string& text,
3756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                       const string& region_code)
3766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
377b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      alternate_formats_(NULL),  // Not used.
3786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      phone_util_(*PhoneNumberUtil::GetInstance()),
3796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      text_(text),
3806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      preferred_region_(region_code),
3816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      leniency_(VALID),
3826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      max_tries_(numeric_limits<int>::max()),
3836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      state_(NOT_READY),
3846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      last_match_(NULL),
3856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      search_index_(0) {
3866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
3876a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
3886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comPhoneNumberMatcher::~PhoneNumberMatcher() {
3896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
3906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
3916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// static
3926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
3936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Combining marks are a subset of non-spacing-mark.
3946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
3956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    return false;
3966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
3976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  UBlockCode block = ublock_getCode(letter);
3986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return ((block == UBLOCK_BASIC_LATIN) ||
3996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
4006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      (block == UBLOCK_LATIN_EXTENDED_A) ||
4016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
4026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      (block == UBLOCK_LATIN_EXTENDED_B) ||
4036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
4046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
4056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
4066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
4076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                        PhoneNumberMatch* match) {
4086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK(match);
4096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Check the candidate doesn't contain any formatting which would indicate
4106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // that it really isn't a phone number.
4116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (!reg_exps_->matching_brackets_->FullMatch(candidate)) {
4126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    return false;
4136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
4146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
4156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // If leniency is set to VALID or stricter, we also want to skip numbers that
4166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // are surrounded by Latin alphabetic characters, to skip cases like
4176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // abc8005001234 or 8005001234def.
4186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (leniency_ >= VALID) {
4196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    // If the candidate is not at the start of the text, and does not start with
4206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    // phone-number punctuation, check the previous character.
4216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    scoped_ptr<RegExpInput> candidate_input(
4226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        reg_exps_->regexp_factory_->CreateInput(candidate));
4236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (offset > 0 &&
4246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
4256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      char32 previous_char;
4266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      const char* previous_char_ptr =
4276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com          EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
4286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                                text_.c_str() + offset);
4296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
43086929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com      // We return false if it is a latin letter or an invalid punctuation
43186929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com      // symbol.
43286929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com      if (IsInvalidPunctuationSymbol(previous_char) ||
43386929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com          IsLatinLetter(previous_char)) {
4346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
4356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
4366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
4376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    size_t lastCharIndex = offset + candidate.length();
4386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (lastCharIndex < text_.length()) {
4396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      char32 next_char;
4406a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      const char* next_char_ptr =
4416a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com          EncodingUtils::AdvanceOneUTF8Character(
4426a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com              text_.c_str() + lastCharIndex - 1);
4436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
44486929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com      if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
4456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
4466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
4476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
4486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
4496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
4506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  PhoneNumber number;
4511fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
4526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      PhoneNumberUtil::NO_PARSING_ERROR) {
4536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    return false;
4546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
4556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
4566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    match->set_start(offset);
4576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    match->set_raw_string(candidate);
4581fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    // We used ParseAndKeepRawInput to create this number, but for now we don't
4591fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    // return the extra values parsed. TODO: stop clearing all values here and
4601fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    // switch all users over to using raw_input() rather than the raw_string()
4611fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    // of PhoneNumberMatch.
4621fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    number.clear_country_code_source();
4631fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    number.clear_preferred_domestic_carrier_code();
4641fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    number.clear_raw_input();
4656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    match->set_number(number);
4666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    return true;
4676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
4686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return false;
4696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
4706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
4716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// Helper method to replace the verification method for each enum in the Java
4726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// version.
4736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::VerifyAccordingToLeniency(
4746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    Leniency leniency, const PhoneNumber& number,
4756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    const string& candidate) const {
4766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  switch (leniency) {
4776a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    case PhoneNumberMatcher::POSSIBLE:
4786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      return phone_util_.IsPossibleNumber(number);
4796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    case PhoneNumberMatcher::VALID:
4801fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      if (!phone_util_.IsValidNumber(number) ||
4811fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com          !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
4826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
4836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
4841fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      return IsNationalPrefixPresentIfRequired(number);
4856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    case PhoneNumberMatcher::STRICT_GROUPING: {
4866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      if (!phone_util_.IsValidNumber(number) ||
4876a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com          !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
4887e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com          ContainsMoreThanOneSlashInNationalNumber(
4897e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com              number, candidate, phone_util_) ||
4901fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com          !IsNationalPrefixPresentIfRequired(number)) {
4916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
4926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
493e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
494e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                      const string&, const vector<string>&>* callback =
495e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com          NewPermanentCallback(&AllNumberGroupsRemainGrouped);
496e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
497e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      delete(callback);
498e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      return is_valid;
4996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
5006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    case PhoneNumberMatcher::EXACT_GROUPING: {
5016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      if (!phone_util_.IsValidNumber(number) ||
5026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com          !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
5037e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com          ContainsMoreThanOneSlashInNationalNumber(
5047e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com              number, candidate, phone_util_) ||
5051fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com          !IsNationalPrefixPresentIfRequired(number)) {
5066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
5076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
508e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
509e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                      const string&, const vector<string>&>* callback =
510e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com          NewPermanentCallback(
511e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com              this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
512e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
513e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      delete(callback);
514e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      return is_valid;
5156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
5166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    default:
5176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      LOG(ERROR) << "No implementation defined for verification for leniency "
5186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                 << static_cast<int>(leniency);
5196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      return false;
5206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
5216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
5226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
5236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
5246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                           PhoneNumberMatch* match) {
5256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK(match);
5266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Try removing either the first or last "group" in the number and see if this
5276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // gives a result. We consider white space to be a possible indication of
5286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // the start or end of the phone number.
5296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<RegExpInput> candidate_input(
5306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      reg_exps_->regexp_factory_->CreateInput(candidate));
5316a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
5326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                                  NULL)) {
5336a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    // Try the first group by itself.
5346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    int group_start_index =
5356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        candidate.length() - candidate_input->ToString().length();
5366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    string first_group_only = candidate.substr(0, group_start_index);
5376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    phone_util_.TrimUnwantedEndChars(&first_group_only);
5386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    bool success = ParseAndVerify(first_group_only, offset, match);
5396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (success) {
5406a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      return true;
5416a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
5426a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    --max_tries_;
5436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
5446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    // Try the rest of the candidate without the first group.
5456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    string without_first_group(candidate_input->ToString());
5466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    phone_util_.TrimUnwantedEndChars(&without_first_group);
5476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    success =
5486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        ParseAndVerify(without_first_group, offset + group_start_index, match);
5496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (success) {
5506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      return true;
5516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
5526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    --max_tries_;
5536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
5546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (max_tries_ > 0) {
5556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
5566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                                         NULL)) {
5576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        // Find the last group.
5586a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
5596a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      int last_group_start =
5606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com          candidate.length() - candidate_input->ToString().length();
5616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      string without_last_group = candidate.substr(0, last_group_start);
5626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      phone_util_.TrimUnwantedEndChars(&without_last_group);
5636a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      if (without_last_group == first_group_only) {
5646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        // If there are only two groups, then the group "without the last group"
5656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        // is the same as the first group. In these cases, we don't want to
5666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        // re-check the number group, so we exit already.
5676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return false;
5686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
5696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      success = ParseAndVerify(without_last_group, offset, match);
5706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      if (success) {
5716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        return true;
5726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      }
5736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      --max_tries_;
5746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
5756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
5766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return false;
5776a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
5786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
5796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
5806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                      PhoneNumberMatch* match) {
5816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK(match);
5826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Skip a match that is more likely a publication page reference or a date.
5836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (reg_exps_->pub_pages_->PartialMatch(candidate) ||
5846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
5856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    return false;
5866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
587cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com  // Skip potential time-stamps.
588cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com  if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
589cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com    scoped_ptr<RegExpInput> following_text(
590cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com        reg_exps_->regexp_factory_->CreateInput(
591cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com            text_.substr(offset + candidate.size())));
592cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com    if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
593cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com      return false;
594cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com    }
595cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com  }
5966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
5976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Try to come up with a valid match given the entire candidate.
5986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (ParseAndVerify(candidate, offset, match)) {
5996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    return true;
6006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
6016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
6026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // If that failed, try to find an "inner match" - there might be a phone
6036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // number within this candidate.
6046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return ExtractInnerMatch(candidate, offset, match);
6056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
6066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
6076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::HasNext() {
6086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (state_ == NOT_READY) {
6096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    PhoneNumberMatch temp_match;
6106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (!Find(search_index_, &temp_match)) {
6116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      state_ = DONE;
6126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    } else {
6136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      last_match_.reset(new PhoneNumberMatch(temp_match.start(),
6146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                             temp_match.raw_string(),
6156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com                                             temp_match.number()));
6166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      search_index_ = last_match_->end();
6176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      state_ = READY;
6186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
6196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
6206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return state_ == READY;
6216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
6226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
6236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
6246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK(match);
6256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  // Check the state and find the next match as a side-effect if necessary.
6266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  if (!HasNext()) {
6276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    return false;
6286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
6296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  match->CopyFrom(*last_match_);
6306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  state_ = NOT_READY;
6316a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  last_match_.reset(NULL);
6326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return true;
6336a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
6346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
6356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
6366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  DCHECK(match);
6376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
6386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  scoped_ptr<RegExpInput> text(
6396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
6406a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  string candidate;
6416a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  while ((max_tries_ > 0) &&
6426a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com         reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
6436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    int start = text_.length() - text->ToString().length() - candidate.length();
6446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    // Check for extra numbers at the end.
6456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    reg_exps_->capture_up_to_second_number_start_pattern_->
6466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com        PartialMatch(candidate, &candidate);
6476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    if (ExtractMatch(candidate, start, match)) {
6486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com      return true;
6496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    }
6506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
6516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    index = start + candidate.length();
6526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com    --max_tries_;
6536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  }
6546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com  return false;
6556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}
6566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com
657e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.combool PhoneNumberMatcher::CheckNumberGroupingIsValid(
658e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const PhoneNumber& phone_number,
659e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const string& candidate,
660e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
661e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                    const string&, const vector<string>&>* checker) const {
662e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  DCHECK(checker);
663e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // TODO: Evaluate how this works for other locales (testing has been limited
664e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // to NANPA regions) and optimise if necessary.
665e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  string normalized_candidate =
666e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      NormalizeUTF8::NormalizeDecimalDigits(candidate);
667e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  vector<string> formatted_number_groups;
668e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  GetNationalNumberGroups(phone_number, NULL,  // Use default formatting pattern
669e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                          &formatted_number_groups);
670e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  if (checker->Run(phone_util_, phone_number, normalized_candidate,
671e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                   formatted_number_groups)) {
672e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    return true;
673e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  }
674b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  // If this didn't pass, see if there are any alternate formats, and try them
675b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  // instead.
676b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  const PhoneMetadata* alternate_formats =
677b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    alternate_formats_->GetAlternateFormatsForCountry(
678b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com        phone_number.country_code());
679b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  if (alternate_formats) {
680b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    for (RepeatedPtrField<NumberFormat>::const_iterator it =
681b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com             alternate_formats->number_format().begin();
682b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com         it != alternate_formats->number_format().end(); ++it) {
683b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      formatted_number_groups.clear();
684b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
685b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      if (checker->Run(phone_util_, phone_number, normalized_candidate,
686b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com                       formatted_number_groups)) {
687b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com        return true;
688b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com      }
689b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com    }
690b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com  }
691e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  return false;
692e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com}
693e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com
694e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com// Helper method to get the national-number part of a number, formatted without
695e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com// any national prefix, and return it as a set of digit blocks that would be
696e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com// formatted together.
697e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.comvoid PhoneNumberMatcher::GetNationalNumberGroups(
698e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const PhoneNumber& number,
699e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const NumberFormat* formatting_pattern,
700e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    vector<string>* digit_blocks) const {
701e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  string rfc3966_format;
702e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  if (!formatting_pattern) {
703e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // This will be in the format +CC-DG;ext=EXT where DG represents groups of
704e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // digits.
705e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
706e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // We remove the extension part from the formatted string before splitting
707e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // it into different groups.
708e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    size_t end_index = rfc3966_format.find(';');
709e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    if (end_index == string::npos) {
710e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      end_index = rfc3966_format.length();
711e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    }
712e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // The country-code will have a '-' following it.
713e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    size_t start_index = rfc3966_format.find('-') + 1;
714e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    SplitStringUsing(rfc3966_format.substr(start_index,
715e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                           end_index - start_index),
716e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                     "-", digit_blocks);
717e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  } else {
718e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    // We format the NSN only, and split that according to the separator.
719e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    string national_significant_number;
720e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    phone_util_.GetNationalSignificantNumber(number,
721e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                             &national_significant_number);
722e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    phone_util_.FormatNsnUsingPattern(national_significant_number,
723e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                      *formatting_pattern,
724e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                      PhoneNumberUtil::RFC3966,
725e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                      &rfc3966_format);
726e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    SplitStringUsing(rfc3966_format, "-", digit_blocks);
727e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  }
728e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com}
729e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com
7301fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.combool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
7311fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    const PhoneNumber& number) const {
7321fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  // First, check how we deduced the country code. If it was written in
7331fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  // international format, then the national prefix is not required.
7341fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
7351fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    return true;
7361fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  }
7371fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  string phone_number_region;
7381fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  phone_util_.GetRegionCodeForCountryCode(
7391fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      number.country_code(), &phone_number_region);
7401fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  const PhoneMetadata* metadata =
7411fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      phone_util_.GetMetadataForRegion(phone_number_region);
7421fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  if (!metadata) {
7431fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    return true;
7441fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  }
7451fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  // Check if a national prefix should be present when formatting this number.
7461fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  string national_number;
7471fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  phone_util_.GetNationalSignificantNumber(number, &national_number);
7481fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  const NumberFormat* format_rule =
7491fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
7501fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com                                                   national_number);
7511fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  // To do this, we check that a national prefix formatting rule was present and
7521fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  // that it wasn't just the first-group symbol ($1) with punctuation.
7531fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
7541fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    if (format_rule->national_prefix_optional_when_formatting()) {
7551fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      // The national-prefix is optional in these cases, so we don't need to
7561fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      // check if it was present.
7571fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      return true;
7581fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    }
75935bd393fb78215a9c6dbeb158913def01eb58985lararennie@google.com    if (phone_util_.FormattingRuleHasFirstGroupOnly(
76035bd393fb78215a9c6dbeb158913def01eb58985lararennie@google.com        format_rule->national_prefix_formatting_rule())) {
7611fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      // National Prefix not needed for this number.
7621fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com      return true;
7631fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    }
7641fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    // Normalize the remainder.
7651fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    string raw_input_copy(number.raw_input());
7661fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    // Check if we found a national prefix and/or carrier code at the start of
7671fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    // the raw input, and return the result.
7681fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    phone_util_.NormalizeDigitsOnly(&raw_input_copy);
7691fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com    return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
7701fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com        *metadata,
7711fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com        &raw_input_copy,
7721fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com        NULL);  // Don't need to keep the stripped carrier code.
7731fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  }
7741fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com  return true;
7751fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com}
7761fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com
777e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.combool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
778e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const PhoneNumberUtil& util,
779e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const PhoneNumber& phone_number,
780e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const string& normalized_candidate,
781e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const vector<string>& formatted_number_groups) const {
782e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    const scoped_ptr<RegExpInput> candidate_number(
783e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
784e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  vector<string> candidate_groups;
785e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  string digit_block;
786e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
787e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com             candidate_number.get(),
788e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com             &digit_block)) {
789e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    candidate_groups.push_back(digit_block);
790e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  }
791e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com
792e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // Set this to the last group, skipping it if the number has an extension.
793e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  int candidate_number_group_index =
794e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      phone_number.has_extension() ? candidate_groups.size() - 2
795e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                   : candidate_groups.size() - 1;
796e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // First we check if the national significant number is formatted as a block.
797e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // We use find and not equals, since the national significant number may be
798e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // present with a prefix such as a national number prefix, or the country code
799e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // itself.
800e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  string national_significant_number;
801e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  util.GetNationalSignificantNumber(phone_number,
802e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                                    &national_significant_number);
803e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  if (candidate_groups.size() == 1 ||
804e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      candidate_groups.at(candidate_number_group_index).find(
805e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com          national_significant_number) != string::npos) {
806e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    return true;
807e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  }
808e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // Starting from the end, go through in reverse, excluding the first group,
809e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // and check the candidate and number groups are the same.
810e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  for (int formatted_number_group_index =
811e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com           (formatted_number_groups.size() - 1);
812e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com       formatted_number_group_index > 0 &&
813e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com       candidate_number_group_index >= 0;
814e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com       --formatted_number_group_index, --candidate_number_group_index) {
815e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    if (candidate_groups.at(candidate_number_group_index) !=
816e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com        formatted_number_groups.at(formatted_number_group_index)) {
817e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com      return false;
818e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com    }
819e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  }
820e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // Now check the first group. There may be a national prefix at the start, so
821e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // we only check that the candidate group ends with the formatted number
822e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  // group.
823e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com  return (candidate_number_group_index >= 0 &&
824e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com          HasSuffixString(candidate_groups.at(candidate_number_group_index),
825e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com                          formatted_number_groups.at(0)));
826e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com}
827e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com
8287e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com// static
8297e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.combool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
8307e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    const PhoneNumber& number,
8317e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    const string& candidate,
8327e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    const PhoneNumberUtil& util) {
8337e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  size_t first_slash_in_body = candidate.find('/');
8347e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  if (first_slash_in_body == string::npos) {
8357e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    // No slashes, this is okay.
8367e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    return false;
8377e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  }
8387e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  // Now look for a second one.
8397e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
8407e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  if (second_slash_in_body == string::npos) {
8417e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    // Only one slash, this is okay.
8427e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    return false;
8437e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  }
8447e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com
8457e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  // If the first slash is after the country calling code, this is permitted.
8467e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
8477e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com      number.country_code_source() ==
8487e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com          PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
8497e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    string normalized_country_code =
8507e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com        candidate.substr(0, first_slash_in_body);
8517e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    util.NormalizeDigitsOnly(&normalized_country_code);
8527e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    if (normalized_country_code == SimpleItoa(number.country_code())) {
8537e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com      // Any more slashes and this is illegal.
8547e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com      return candidate.find('/', second_slash_in_body + 1) != string::npos;
8557e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com    }
8567e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  }
8577e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com  return true;
8587e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com}
8597e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com
8606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}  // namespace phonenumbers
8616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}  // namespace i18n
862