// Copyright (C) 2011 The Libphonenumber Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: Lara Rennie
// Author: Tao Huang
//
// Implementation of a stateful class that finds and extracts telephone numbers
// from text.
206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumbermatcher.h" 226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 23fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP 24fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#error phonenumbermatcher depends on ICU \ 25fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set) 26fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#endif // I18N_PHONENUMBERS_USE_ICU_REGEXP 276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 28603e7e5f83aad6e45e8d794c604a546936b77a16philip.liard@gmail.com#include <ctype.h> 297e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com#include <stddef.h> 306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include <limits> 31af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include <map> 326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include <string> 3335bd393fb78215a9c6dbeb158913def01eb58985lararennie@google.com#include <utility> 34603e7e5f83aad6e45e8d794c604a546936b77a16philip.liard@gmail.com#include <vector> 35603e7e5f83aad6e45e8d794c604a546936b77a16philip.liard@gmail.com 366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include <unicode/uchar.h> 376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 38b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com#include "phonenumbers/alternate_format.h" 39af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include "phonenumbers/base/logging.h" 40af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include "phonenumbers/base/memory/scoped_ptr.h" 41af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include "phonenumbers/base/memory/singleton.h" 
42e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com#include "phonenumbers/callback.h" 436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/default_logger.h" 446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/encoding_utils.h" 456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/normalize_utf8.h" 461fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com#include "phonenumbers/phonemetadata.pb.h" 476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumber.pb.h" 486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumbermatch.h" 496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/phonenumberutil.h" 506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/regexp_adapter.h" 516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/regexp_adapter_icu.h" 526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/stringutil.h" 536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 54fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#ifdef I18N_PHONENUMBERS_USE_RE2 556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#include "phonenumbers/regexp_adapter_re2.h" 56fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#endif // I18N_PHONENUMBERS_USE_RE2_AND_ICU 576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 58b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.comusing std::make_pair; 59b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.comusing std::map; 606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comusing std::numeric_limits; 616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comusing std::string; 
626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comusing std::vector; 636a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comnamespace i18n { 656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comnamespace phonenumbers { 666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comnamespace { 686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// Returns a regular expression quantifier with an upper and lower limit. 696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comstring Limit(int lower, int upper) { 706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK_GE(lower, 0); 716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK_GT(upper, 0); 726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK_LT(lower, upper); 736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return StrCat("{", lower, ",", upper, "}"); 746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 7686929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.combool IsInvalidPunctuationSymbol(char32 character) { 7786929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL; 786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate, 816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com const PhoneNumberUtil& util) { 826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // The characters 'x' and 'X' can be (1) a carrier code, in which case they 
836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // always precede the national significant number or (2) an extension sign, 846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // in which case they always precede the extension number. We assume a 856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // carrier code is more than 1 digit, so the first case has to have more than 866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1 876a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // 'x' or 'X'. 886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com size_t found; 896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com found = candidate.find_first_of("xX"); 906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // We ignore the character if 'x' or 'X' appears as the last character of 916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // the string. 926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com while (found != string::npos && found < candidate.length() - 1) { 936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // We only look for 'x' or 'X' in ASCII form. 946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com char next_char = candidate[found + 1]; 956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (next_char == 'x' || next_char == 'X') { 966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // This is the carrier code case, in which the 'X's always precede the 976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // national significant number. 
986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com ++found; 996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (util.IsNumberMatchWithOneString( 1006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com number, candidate.substr(found, candidate.length() - found)) 1016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com != PhoneNumberUtil::NSN_MATCH) { 1026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 1036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 1046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } else { 1056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string normalized_extension(candidate.substr(found, 1066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com candidate.length() - found)); 1076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com util.NormalizeDigitsOnly(&normalized_extension); 1086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (normalized_extension != number.extension()) { 1096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 1106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 1116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 1126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com found = candidate.find_first_of("xX", found + 1); 1136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 1146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 1156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 116e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com 117e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.combool AllNumberGroupsRemainGrouped( 118e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const PhoneNumberUtil& util, 119e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const PhoneNumber& phone_number, 
120e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const string& normalized_candidate, 121e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const vector<string>& formatted_number_groups) { 122e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com size_t from_index = 0; 123e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // Check each group of consecutive digits are not broken into separate 124e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // groupings in the normalized_candidate string. 125e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com for (size_t i = 0; i < formatted_number_groups.size(); ++i) { 126e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // Fails if the substring of normalized_candidate starting from from_index 127e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // doesn't contain the consecutive digits in formatted_number_groups.at(i). 128e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com from_index = normalized_candidate.find(formatted_number_groups.at(i), 129e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com from_index); 130e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com if (from_index == string::npos) { 131e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return false; 132e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 133e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // Moves from_index forward. 134e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com from_index += formatted_number_groups.at(i).length(); 135e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com if (i == 0 && from_index < normalized_candidate.length()) { 136b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // We are at the position right after the NDC. 
We get the region used for 137b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // formatting information based on the country code in the phone number, 138b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // rather than the number itself, as we do not need to distinguish between 139b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // different countries with the same country calling code and this is 140b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // faster. 141b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com string region; 142b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com util.GetRegionCodeForCountryCode(phone_number.country_code(), ®ion); 143b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com string ndd_prefix; 144b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com util.GetNddPrefixForRegion(region, true, &ndd_prefix); 145b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // Note although normalized_candidate might contain non-ASCII formatting 146b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // characters, they won't be treated as ASCII digits when converted to a 147b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // char. 148b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) { 149e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // This means there is no formatting symbol after the NDC. In this case, 150e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // we only accept the number if there is no formatting symbol at all in 151b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // the number, except for extensions. This is only important for 152b3bfbbcb458043ddaaa1099b776014ba0968704dlararennie@google.com // countries with national prefixes. 
153e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com string national_significant_number; 154e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com util.GetNationalSignificantNumber( 155e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com phone_number, &national_significant_number); 156e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return HasPrefixString(normalized_candidate.substr( 157e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com from_index - formatted_number_groups.at(i).length()), 158e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com national_significant_number); 159e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 160e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 161e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 162e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // The check here makes sure that we haven't mistakenly already used the 163e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // extension to match the last group of the subscriber number. Note the 164e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // extension cannot have formatting in-between digits. 
165e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return normalized_candidate.substr(from_index) 166e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com .find(phone_number.extension()) != string::npos; 167e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com} 168b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com 169b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.combool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) { 170de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com#if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS) 171b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com if (!alternate_formats->ParseFromArray(alternate_format_get(), 172b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com alternate_format_size())) { 1738d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com LOG(ERROR) << "Could not parse binary data."; 174b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com return false; 175b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 176b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com return true; 177de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com#else 178de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com return false; 179de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com#endif 180b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com} 181de41ee57fc42757aaa958909a2cb1ff6af5d9356philip.liard@gmail.com 1826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} // namespace 1836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 1846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comclass PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> { 1856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com private: 186fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com friend class 
Singleton<PhoneNumberMatcherRegExps>; 187fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com 1886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string opening_parens_; 1896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string closing_parens_; 1906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string non_parens_; 1916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Limit on the number of pairs of brackets in a phone number. 1926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string bracket_pair_limit_; 1936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Helper strings for the matching_brackets_ pattern. 1946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // An opening bracket at the beginning may not be closed, but subsequent ones 1956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // should be. It's also possible that the leading bracket was dropped, so we 1966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // shouldn't be surprised if we see a closing bracket first. 1976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string leading_maybe_matched_bracket_; 1986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string bracket_pairs_; 1996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Limit on the number of leading (plus) characters. 2006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string lead_limit_; 2016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Limit on the number of consecutive punctuation characters. 2026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string punctuation_limit_; 2036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // The maximum number of digits allowed in a digit-separated block. 
As we 2046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // allow all digits in a single block, this should be set high enough to 2056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // accommodate the entire national number and the international country code. 2066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com int digit_block_limit_; 2076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Limit on the number of blocks separated by punctuation. Uses 2086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // kDigitBlockLimit since some formats use spaces to separate each digit. 2096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string block_limit_; 2106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // A punctuation sequence allowing white space. 2116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string punctuation_; 2126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // A digits block without punctuation. 2136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string digit_sequence_; 2146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Punctuation that may be at the start of a phone number - brackets and plus 2156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // signs. 2166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string lead_class_chars_; 2176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Same as lead_class_chars_, but enclosed as a character class. 2186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string lead_class_; 2196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Extra helper strings that form part of pattern_. These are stored 2206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // separately since StrCat has a limit of 12 args. 
2216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string opening_punctuation_; 2226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string optional_extn_pattern_; 2236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 2246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com public: 2256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // We use two different reg-ex factories here for performance reasons. RE2 is 2266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // much faster for smaller reg-ex patterns, but the main pattern cannot be 2276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // handled by RE2 in an efficient way. 2286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_; 2296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const AbstractRegExpFactory> regexp_factory_; 2306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 2316a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Matches strings that look like publication pages. Example: 2326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Computing Complete Answers to Queries in the Presence of Limited Access 2336a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003). 2346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // 2356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // The string "211-227 (2003)" is not a telephone number. 2366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> pub_pages_; 2376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Matches strings that look like dates using "/" as a separator. Examples: 2386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // 3/10/2011, 31/10/96 or 08/31/95. 
2396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> slash_separated_dates_; 240cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does 241cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_. 242cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com scoped_ptr<const RegExp> time_stamps_; 243cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com scoped_ptr<const RegExp> time_stamps_suffix_; 2446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Pattern to check that brackets match. Opening brackets should be closed 2456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // within a phone number. This also checks that there is something inside the 2466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // brackets. Having no brackets at all is also fine. 2476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> matching_brackets_; 2486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Matches white-space, which may indicate the end of a phone number and the 2496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // start of something else (such as a neighbouring zip-code). If white-space 2506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // is found, continues to match all characters that are not typically used to 2516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // start a phone number. 
2526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> group_separator_; 2536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_; 2546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> capturing_ascii_digits_pattern_; 2556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Compiled reg-ex representing lead_class_; 2566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> lead_class_pattern_; 2576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Phone number pattern allowing optional punctuation. 2586a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<const RegExp> pattern_; 2596a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 2606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberMatcherRegExps() 2616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */), 2626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */), 2636a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")), 2646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com bracket_pair_limit_(Limit(0, 3)), 2656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com leading_maybe_matched_bracket_(StrCat( 2666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "(?:[", opening_parens_, "])?", 2676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "(?:", non_parens_, "+[", closing_parens_, "])?")), 2686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com bracket_pairs_(StrCat( 2696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "(?:[", opening_parens_, "]", non_parens_, "+", 
2706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "[", closing_parens_, "])", bracket_pair_limit_)), 2716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com lead_limit_(Limit(0, 2)), 2726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com punctuation_limit_(Limit(0, 4)), 2736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn + 2746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberUtil::kMaxLengthCountryCode), 2756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com block_limit_(Limit(0, digit_block_limit_)), 2766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]", 2776a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com punctuation_limit_)), 2786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))), 2796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)), 2806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com lead_class_(StrCat("[", lead_class_chars_, "]")), 2816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")), 2826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com optional_extn_pattern_(StrCat( 2836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "(?i)(?:", 2846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(), 2856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com ")?")), 2866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com regexp_factory_for_pattern_(new ICURegExpFactory()), 287fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#ifdef I18N_PHONENUMBERS_USE_RE2 
2886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com regexp_factory_(new RE2RegExpFactory()), 2896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com#else 2906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com regexp_factory_(new ICURegExpFactory()), 291fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com#endif // I18N_PHONENUMBERS_USE_RE2 2926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com pub_pages_(regexp_factory_->CreateRegExp( 2936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")), 2946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com slash_separated_dates_(regexp_factory_->CreateRegExp( 2956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "(?:(?:[0-3]?\\d/[01]?\\d)|" 2966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")), 297cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com time_stamps_(regexp_factory_->CreateRegExp( 298cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")), 299cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")), 3006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com matching_brackets_(regexp_factory_->CreateRegExp( 3016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com StrCat(leading_maybe_matched_bracket_, non_parens_, "+", 3026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com bracket_pairs_, non_parens_, "*"))), 3036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com group_separator_(regexp_factory_->CreateRegExp( 3046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))), 3056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com capture_up_to_second_number_start_pattern_( 
3066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com regexp_factory_->CreateRegExp( 3076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberUtil::kCaptureUpToSecondNumberStart)), 3086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com capturing_ascii_digits_pattern_( 3096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com regexp_factory_->CreateRegExp("(\\d+)")), 3106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)), 3116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com pattern_(regexp_factory_for_pattern_->CreateRegExp( 3126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com StrCat("(", opening_punctuation_, lead_limit_, 3136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com digit_sequence_, "(?:", punctuation_, digit_sequence_, ")", 3146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com block_limit_, optional_extn_pattern_, ")"))) { 3156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 3166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 3176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com private: 3186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps); 3196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com}; 3206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 321b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.comclass AlternateFormats : public Singleton<AlternateFormats> { 322b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com public: 323b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com PhoneMetadataCollection format_data_; 324b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com 325b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_; 
326b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com 327b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com AlternateFormats() 328b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com : format_data_(), 329b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com calling_code_to_alternate_formats_map_() { 330b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com if (!LoadAlternateFormats(&format_data_)) { 331b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com LOG(DFATAL) << "Could not parse compiled-in metadata."; 332b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com return; 333b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 334b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com for (RepeatedPtrField<PhoneMetadata>::const_iterator it = 335b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com format_data_.metadata().begin(); 336b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com it != format_data_.metadata().end(); 337b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com ++it) { 338b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com calling_code_to_alternate_formats_map_.insert( 339b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com make_pair(it->country_code(), &*it)); 340b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 341b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 342b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com 343b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code) 344b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com const { 345b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com map<int, const PhoneMetadata*>::const_iterator it = 346b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com calling_code_to_alternate_formats_map_.find(country_calling_code); 
347b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com if (it != calling_code_to_alternate_formats_map_.end()) { 348b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com return it->second; 349b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 350b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com return NULL; 351b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 352b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com 353b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com private: 354b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com DISALLOW_COPY_AND_ASSIGN(AlternateFormats); 355b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com}; 356b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com 3576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comPhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util, 3586a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com const string& text, 3596a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com const string& region_code, 3606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberMatcher::Leniency leniency, 3616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com int max_tries) 3626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), 363b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com alternate_formats_(AlternateFormats::GetInstance()), 3646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com phone_util_(util), 3656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com text_(text), 3666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com preferred_region_(region_code), 3676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com leniency_(leniency), 3686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com max_tries_(max_tries), 
3696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com state_(NOT_READY), 3706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com last_match_(NULL), 3716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com search_index_(0) { 3726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 3736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 3746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comPhoneNumberMatcher::PhoneNumberMatcher(const string& text, 3756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com const string& region_code) 3766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), 377b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com alternate_formats_(NULL), // Not used. 3786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com phone_util_(*PhoneNumberUtil::GetInstance()), 3796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com text_(text), 3806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com preferred_region_(region_code), 3816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com leniency_(VALID), 3826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com max_tries_(numeric_limits<int>::max()), 3836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com state_(NOT_READY), 3846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com last_match_(NULL), 3856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com search_index_(0) { 3866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 3876a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 3886a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.comPhoneNumberMatcher::~PhoneNumberMatcher() { 3896a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 3906a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 
3916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// static 3926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::IsLatinLetter(char32 letter) { 3936a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Combining marks are a subset of non-spacing-mark. 3946a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) { 3956a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 3966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 3976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com UBlockCode block = ublock_getCode(letter); 3986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return ((block == UBLOCK_BASIC_LATIN) || 3996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com (block == UBLOCK_LATIN_1_SUPPLEMENT) || 4006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com (block == UBLOCK_LATIN_EXTENDED_A) || 4016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) || 4026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com (block == UBLOCK_LATIN_EXTENDED_B) || 4036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com (block == UBLOCK_COMBINING_DIACRITICAL_MARKS)); 4046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 4056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 4066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset, 4076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberMatch* match) { 4086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK(match); 4096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Check the candidate doesn't contain any formatting which would indicate 
4106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // that it really isn't a phone number. 4116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (!reg_exps_->matching_brackets_->FullMatch(candidate)) { 4126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 4136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 4156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // If leniency is set to VALID or stricter, we also want to skip numbers that 4166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // are surrounded by Latin alphabetic characters, to skip cases like 4176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // abc8005001234 or 8005001234def. 4186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (leniency_ >= VALID) { 4196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // If the candidate is not at the start of the text, and does not start with 4206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // phone-number punctuation, check the previous character. 
4216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<RegExpInput> candidate_input( 4226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com reg_exps_->regexp_factory_->CreateInput(candidate)); 4236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (offset > 0 && 4246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) { 4256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com char32 previous_char; 4266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com const char* previous_char_ptr = 4276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com EncodingUtils::BackUpOneUTF8Character(text_.c_str(), 4286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com text_.c_str() + offset); 4296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char); 43086929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com // We return false if it is a latin letter or an invalid punctuation 43186929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com // symbol. 
43286929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com if (IsInvalidPunctuationSymbol(previous_char) || 43386929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com IsLatinLetter(previous_char)) { 4346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 4356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com size_t lastCharIndex = offset + candidate.length(); 4386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (lastCharIndex < text_.length()) { 4396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com char32 next_char; 4406a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com const char* next_char_ptr = 4416a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com EncodingUtils::AdvanceOneUTF8Character( 4426a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com text_.c_str() + lastCharIndex - 1); 4436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char); 44486929beca4cb5d81cbad75353b1bb13c6cd6a4bdphilip.liard@gmail.com if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) { 4456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 4466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 4506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumber number; 4511fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) != 4526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberUtil::NO_PARSING_ERROR) { 
4536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 4546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (VerifyAccordingToLeniency(leniency_, number, candidate)) { 4566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com match->set_start(offset); 4576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com match->set_raw_string(candidate); 4581fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // We used ParseAndKeepRawInput to create this number, but for now we don't 4591fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // return the extra values parsed. TODO: stop clearing all values here and 4601fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // switch all users over to using raw_input() rather than the raw_string() 4611fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // of PhoneNumberMatch. 4621fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com number.clear_country_code_source(); 4631fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com number.clear_preferred_domestic_carrier_code(); 4641fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com number.clear_raw_input(); 4656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com match->set_number(number); 4666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 4676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 4696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 4706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 4716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// Helper method to replace the verification method for each enum in the Java 4726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com// version. 
4736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::VerifyAccordingToLeniency( 4746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com Leniency leniency, const PhoneNumber& number, 4756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com const string& candidate) const { 4766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com switch (leniency) { 4776a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com case PhoneNumberMatcher::POSSIBLE: 4786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return phone_util_.IsPossibleNumber(number); 4796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com case PhoneNumberMatcher::VALID: 4801fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com if (!phone_util_.IsValidNumber(number) || 4811fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com !ContainsOnlyValidXChars(number, candidate, phone_util_)) { 4826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 4836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 4841fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com return IsNationalPrefixPresentIfRequired(number); 4856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com case PhoneNumberMatcher::STRICT_GROUPING: { 4866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (!phone_util_.IsValidNumber(number) || 4876a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com !ContainsOnlyValidXChars(number, candidate, phone_util_) || 4887e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com ContainsMoreThanOneSlashInNationalNumber( 4897e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com number, candidate, phone_util_) || 4901fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com !IsNationalPrefixPresentIfRequired(number)) { 4916a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 
4926a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 493e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 494e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const string&, const vector<string>&>* callback = 495e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com NewPermanentCallback(&AllNumberGroupsRemainGrouped); 496e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); 497e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com delete(callback); 498e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return is_valid; 4996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com case PhoneNumberMatcher::EXACT_GROUPING: { 5016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (!phone_util_.IsValidNumber(number) || 5026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com !ContainsOnlyValidXChars(number, candidate, phone_util_) || 5037e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com ContainsMoreThanOneSlashInNationalNumber( 5047e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com number, candidate, phone_util_) || 5051fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com !IsNationalPrefixPresentIfRequired(number)) { 5066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 5076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 508e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 509e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const string&, const vector<string>&>* callback = 510e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com NewPermanentCallback( 
511e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent); 512e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); 513e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com delete(callback); 514e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return is_valid; 5156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com default: 5176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com LOG(ERROR) << "No implementation defined for verification for leniency " 5186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com << static_cast<int>(leniency); 5196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 5206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 5226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 5236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset, 5246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberMatch* match) { 5256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK(match); 5266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Try removing either the first or last "group" in the number and see if this 5276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // gives a result. We consider white space to be a possible indication of 5286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // the start or end of the phone number. 
5296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<RegExpInput> candidate_input( 5306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com reg_exps_->regexp_factory_->CreateInput(candidate)); 5316a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), 5326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com NULL)) { 5336a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Try the first group by itself. 5346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com int group_start_index = 5356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com candidate.length() - candidate_input->ToString().length(); 5366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string first_group_only = candidate.substr(0, group_start_index); 5376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com phone_util_.TrimUnwantedEndChars(&first_group_only); 5386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com bool success = ParseAndVerify(first_group_only, offset, match); 5396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (success) { 5406a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 5416a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5426a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com --max_tries_; 5436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 5446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Try the rest of the candidate without the first group. 
5456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string without_first_group(candidate_input->ToString()); 5466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com phone_util_.TrimUnwantedEndChars(&without_first_group); 5476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com success = 5486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com ParseAndVerify(without_first_group, offset + group_start_index, match); 5496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (success) { 5506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 5516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com --max_tries_; 5536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 5546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (max_tries_ > 0) { 5556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), 5566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com NULL)) { 5576a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Find the last group. 
5586a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5596a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com int last_group_start = 5606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com candidate.length() - candidate_input->ToString().length(); 5616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string without_last_group = candidate.substr(0, last_group_start); 5626a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com phone_util_.TrimUnwantedEndChars(&without_last_group); 5636a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (without_last_group == first_group_only) { 5646a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // If there are only two groups, then the group "without the last group" 5656a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // is the same as the first group. In these cases, we don't want to 5666a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // re-check the number group, so we exit already. 
5676a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 5686a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5696a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com success = ParseAndVerify(without_last_group, offset, match); 5706a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (success) { 5716a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 5726a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5736a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com --max_tries_; 5746a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5756a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 5766a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 5776a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 5786a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 5796a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset, 5806a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberMatch* match) { 5816a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK(match); 5826a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Skip a match that is more likely a publication page reference or a date. 5836a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (reg_exps_->pub_pages_->PartialMatch(candidate) || 5846a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com reg_exps_->slash_separated_dates_->PartialMatch(candidate)) { 5856a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 5866a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 587cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com // Skip potential time-stamps. 
588cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com if (reg_exps_->time_stamps_->PartialMatch(candidate)) { 589cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com scoped_ptr<RegExpInput> following_text( 590cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com reg_exps_->regexp_factory_->CreateInput( 591cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com text_.substr(offset + candidate.size()))); 592cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) { 593cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com return false; 594cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com } 595cbc255f39ceade5fc9d653e320c511a5f9c51e77philip.liard@gmail.com } 5966a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 5976a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Try to come up with a valid match given the entire candidate. 5986a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (ParseAndVerify(candidate, offset, match)) { 5996a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 6006a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 6016a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 6026a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // If that failed, try to find an "inner match" - there might be a phone 6036a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // number within this candidate. 
6046a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return ExtractInnerMatch(candidate, offset, match); 6056a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 6066a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 6076a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::HasNext() { 6086a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (state_ == NOT_READY) { 6096a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PhoneNumberMatch temp_match; 6106a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (!Find(search_index_, &temp_match)) { 6116a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com state_ = DONE; 6126a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } else { 6136a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com last_match_.reset(new PhoneNumberMatch(temp_match.start(), 6146a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com temp_match.raw_string(), 6156a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com temp_match.number())); 6166a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com search_index_ = last_match_->end(); 6176a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com state_ = READY; 6186a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 6196a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 6206a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return state_ == READY; 6216a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 6226a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 6236a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::Next(PhoneNumberMatch* match) { 6246a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK(match); 6256a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Check the state and find the next match as a side-effect if necessary. 
6266a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (!HasNext()) { 6276a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 6286a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 6296a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com match->CopyFrom(*last_match_); 6306a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com state_ = NOT_READY; 6316a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com last_match_.reset(NULL); 6326a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 6336a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 6346a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 6356a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.combool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) { 6366a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com DCHECK(match); 6376a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 6386a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com scoped_ptr<RegExpInput> text( 6396a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index))); 6406a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com string candidate; 6416a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com while ((max_tries_ > 0) && 6426a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) { 6436a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com int start = text_.length() - text->ToString().length() - candidate.length(); 6446a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com // Check for extra numbers at the end. 
6456a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com reg_exps_->capture_up_to_second_number_start_pattern_-> 6466a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com PartialMatch(candidate, &candidate); 6476a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com if (ExtractMatch(candidate, start, match)) { 6486a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return true; 6496a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 6506a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 6516a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com index = start + candidate.length(); 6526a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com --max_tries_; 6536a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com } 6546a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com return false; 6556a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} 6566a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com 657e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.combool PhoneNumberMatcher::CheckNumberGroupingIsValid( 658e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const PhoneNumber& phone_number, 659e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const string& candidate, 660e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 661e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const string&, const vector<string>&>* checker) const { 662e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com DCHECK(checker); 663e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // TODO: Evaluate how this works for other locales (testing has been limited 664e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // to NANPA regions) and optimise if necessary. 
665e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com string normalized_candidate = 666e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com NormalizeUTF8::NormalizeDecimalDigits(candidate); 667e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com vector<string> formatted_number_groups; 668e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern 669e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com &formatted_number_groups); 670e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com if (checker->Run(phone_util_, phone_number, normalized_candidate, 671e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com formatted_number_groups)) { 672e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return true; 673e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 674b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com // If this didn't pass, see if there are any alternate formats, and try them 675b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com // instead. 
676b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com const PhoneMetadata* alternate_formats = 677b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com alternate_formats_->GetAlternateFormatsForCountry( 678b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com phone_number.country_code()); 679b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com if (alternate_formats) { 680b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com for (RepeatedPtrField<NumberFormat>::const_iterator it = 681b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com alternate_formats->number_format().begin(); 682b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com it != alternate_formats->number_format().end(); ++it) { 683b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com formatted_number_groups.clear(); 684b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups); 685b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com if (checker->Run(phone_util_, phone_number, normalized_candidate, 686b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com formatted_number_groups)) { 687b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com return true; 688b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 689b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 690b3485221df548df7eb585b3b321d8035a7570678dbeaumont@google.com } 691e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return false; 692e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com} 693e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com 694e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com// Helper method to get the national-number part of a number, formatted without 695e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com// any national prefix, and return it as a set of digit blocks that would be 
696e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com// formatted together. 697e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.comvoid PhoneNumberMatcher::GetNationalNumberGroups( 698e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const PhoneNumber& number, 699e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const NumberFormat* formatting_pattern, 700e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com vector<string>* digit_blocks) const { 701e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com string rfc3966_format; 702e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com if (!formatting_pattern) { 703e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // This will be in the format +CC-DG;ext=EXT where DG represents groups of 704e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // digits. 705e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format); 706e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // We remove the extension part from the formatted string before splitting 707e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // it into different groups. 708e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com size_t end_index = rfc3966_format.find(';'); 709e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com if (end_index == string::npos) { 710e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com end_index = rfc3966_format.length(); 711e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 712e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // The country-code will have a '-' following it. 
713e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com size_t start_index = rfc3966_format.find('-') + 1; 714e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com SplitStringUsing(rfc3966_format.substr(start_index, 715e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com end_index - start_index), 716e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com "-", digit_blocks); 717e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } else { 718e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // We format the NSN only, and split that according to the separator. 719e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com string national_significant_number; 720e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com phone_util_.GetNationalSignificantNumber(number, 721e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com &national_significant_number); 722e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com phone_util_.FormatNsnUsingPattern(national_significant_number, 723e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com *formatting_pattern, 724e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com PhoneNumberUtil::RFC3966, 725e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com &rfc3966_format); 726e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com SplitStringUsing(rfc3966_format, "-", digit_blocks); 727e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 728e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com} 729e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com 7301fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.combool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired( 7311fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com const PhoneNumber& number) const { 7321fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // First, check how we deduced the country code. 
If it was written in 7331fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // international format, then the national prefix is not required. 7341fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) { 7351fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com return true; 7361fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com } 7371fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com string phone_number_region; 7381fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com phone_util_.GetRegionCodeForCountryCode( 7391fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com number.country_code(), &phone_number_region); 7401fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com const PhoneMetadata* metadata = 7411fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com phone_util_.GetMetadataForRegion(phone_number_region); 7421fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com if (!metadata) { 7431fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com return true; 7441fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com } 7451fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // Check if a national prefix should be present when formatting this number. 
7461fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com string national_number; 7471fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com phone_util_.GetNationalSignificantNumber(number, &national_number); 7481fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com const NumberFormat* format_rule = 7491fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(), 7501fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com national_number); 7511fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // To do this, we check that a national prefix formatting rule was present and 7521fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // that it wasn't just the first-group symbol ($1) with punctuation. 7531fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) { 7541fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com if (format_rule->national_prefix_optional_when_formatting()) { 7551fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // The national-prefix is optional in these cases, so we don't need to 7561fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // check if it was present. 7571fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com return true; 7581fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com } 75935bd393fb78215a9c6dbeb158913def01eb58985lararennie@google.com if (phone_util_.FormattingRuleHasFirstGroupOnly( 76035bd393fb78215a9c6dbeb158913def01eb58985lararennie@google.com format_rule->national_prefix_formatting_rule())) { 7611fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // National Prefix not needed for this number. 
7621fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com return true; 7631fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com } 7641fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // Normalize the remainder. 7651fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com string raw_input_copy(number.raw_input()); 7661fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // Check if we found a national prefix and/or carrier code at the start of 7671fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com // the raw input, and return the result. 7681fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com phone_util_.NormalizeDigitsOnly(&raw_input_copy); 7691fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com return phone_util_.MaybeStripNationalPrefixAndCarrierCode( 7701fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com *metadata, 7711fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com &raw_input_copy, 7721fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com NULL); // Don't need to keep the stripped carrier code. 
7731fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com } 7741fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com return true; 7751fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com} 7761fb4d23b94da7f0343ce7d177bee350db73e61b5philip.liard@gmail.com 777e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.combool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent( 778e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const PhoneNumberUtil& util, 779e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const PhoneNumber& phone_number, 780e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const string& normalized_candidate, 781e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const vector<string>& formatted_number_groups) const { 782e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com const scoped_ptr<RegExpInput> candidate_number( 783e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com reg_exps_->regexp_factory_->CreateInput(normalized_candidate)); 784e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com vector<string> candidate_groups; 785e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com string digit_block; 786e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume( 787e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com candidate_number.get(), 788e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com &digit_block)) { 789e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com candidate_groups.push_back(digit_block); 790e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 791e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com 792e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // Set this to the last group, skipping it if the number has an extension. 
793e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com int candidate_number_group_index = 794e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com phone_number.has_extension() ? candidate_groups.size() - 2 795e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com : candidate_groups.size() - 1; 796e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // First we check if the national significant number is formatted as a block. 797e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // We use find and not equals, since the national significant number may be 798e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // present with a prefix such as a national number prefix, or the country code 799e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // itself. 800e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com string national_significant_number; 801e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com util.GetNationalSignificantNumber(phone_number, 802e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com &national_significant_number); 803e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com if (candidate_groups.size() == 1 || 804e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com candidate_groups.at(candidate_number_group_index).find( 805e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com national_significant_number) != string::npos) { 806e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return true; 807e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 808e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // Starting from the end, go through in reverse, excluding the first group, 809e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // and check the candidate and number groups are the same. 
810e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com for (int formatted_number_group_index = 811e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com (formatted_number_groups.size() - 1); 812e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com formatted_number_group_index > 0 && 813e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com candidate_number_group_index >= 0; 814e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com --formatted_number_group_index, --candidate_number_group_index) { 815e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com if (candidate_groups.at(candidate_number_group_index) != 816e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com formatted_number_groups.at(formatted_number_group_index)) { 817e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return false; 818e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 819e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com } 820e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // Now check the first group. There may be a national prefix at the start, so 821e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // we only check that the candidate group ends with the formatted number 822e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com // group. 
823e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com return (candidate_number_group_index >= 0 && 824e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com HasSuffixString(candidate_groups.at(candidate_number_group_index), 825e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com formatted_number_groups.at(0))); 826e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com} 827e71e8316d5660edbb7fe5d39a8a626aaf22b4702philip.liard@gmail.com 8287e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com// static 8297e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.combool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber( 8307e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com const PhoneNumber& number, 8317e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com const string& candidate, 8327e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com const PhoneNumberUtil& util) { 8337e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com size_t first_slash_in_body = candidate.find('/'); 8347e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com if (first_slash_in_body == string::npos) { 8357e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com // No slashes, this is okay. 8367e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com return false; 8377e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com } 8387e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com // Now look for a second one. 8397e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1); 8407e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com if (second_slash_in_body == string::npos) { 8417e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com // Only one slash, this is okay. 
8427e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com return false; 8437e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com } 8447e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com 8457e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com // If the first slash is after the country calling code, this is permitted. 8467e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN || 8477e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com number.country_code_source() == 8487e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) { 8497e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com string normalized_country_code = 8507e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com candidate.substr(0, first_slash_in_body); 8517e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com util.NormalizeDigitsOnly(&normalized_country_code); 8527e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com if (normalized_country_code == SimpleItoa(number.country_code())) { 8537e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com // Any more slashes and this is illegal. 8547e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com return candidate.find('/', second_slash_in_body + 1) != string::npos; 8557e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com } 8567e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com } 8577e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com return true; 8587e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com} 8597e77f5f74f097c32c256e8e1270ecbc306fc6567lararennie@google.com 8606a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} // namespace phonenumbers 8616a0a07f4f0066eba2dc9bb81465f4e67d670c7b6philip.liard@gmail.com} // namespace i18n 862