1// Copyright (C) 2011 The Libphonenumber Authors 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14// 15// Author: Lara Rennie 16// Author: Tao Huang 17// 18// Implementation of a stateful class that finds and extracts telephone numbers 19// from text. 20 21#include "phonenumbers/phonenumbermatcher.h" 22 23#ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP 24#error phonenumbermatcher depends on ICU \ 25 (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set) 26#endif // I18N_PHONENUMBERS_USE_ICU_REGEXP 27 28#include <ctype.h> 29#include <stddef.h> 30#include <limits> 31#include <map> 32#include <string> 33#include <utility> 34#include <vector> 35 36#include <unicode/uchar.h> 37 38#include "phonenumbers/alternate_format.h" 39#include "phonenumbers/base/logging.h" 40#include "phonenumbers/base/memory/scoped_ptr.h" 41#include "phonenumbers/base/memory/singleton.h" 42#include "phonenumbers/callback.h" 43#include "phonenumbers/default_logger.h" 44#include "phonenumbers/encoding_utils.h" 45#include "phonenumbers/normalize_utf8.h" 46#include "phonenumbers/phonemetadata.pb.h" 47#include "phonenumbers/phonenumber.pb.h" 48#include "phonenumbers/phonenumbermatch.h" 49#include "phonenumbers/phonenumberutil.h" 50#include "phonenumbers/regexp_adapter.h" 51#include "phonenumbers/regexp_adapter_icu.h" 52#include "phonenumbers/stringutil.h" 53 54#ifdef I18N_PHONENUMBERS_USE_RE2 55#include "phonenumbers/regexp_adapter_re2.h" 56#endif // I18N_PHONENUMBERS_USE_RE2_AND_ICU 57 58using std::make_pair; 59using std::map; 60using std::numeric_limits; 61using std::string; 62using std::vector; 63 64namespace i18n { 65namespace phonenumbers { 66 67namespace { 68// Returns a regular expression quantifier with an upper and lower limit. 69string Limit(int lower, int upper) { 70 DCHECK_GE(lower, 0); 71 DCHECK_GT(upper, 0); 72 DCHECK_LT(lower, upper); 73 return StrCat("{", lower, ",", upper, "}"); 74} 75 76bool IsInvalidPunctuationSymbol(char32 character) { 77 return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL; 78} 79 80bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate, 81 const PhoneNumberUtil& util) { 82 // The characters 'x' and 'X' can be (1) a carrier code, in which case they 83 // always precede the national significant number or (2) an extension sign, 84 // in which case they always precede the extension number. We assume a 85 // carrier code is more than 1 digit, so the first case has to have more than 86 // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1 87 // 'x' or 'X'. 88 size_t found; 89 found = candidate.find_first_of("xX"); 90 // We ignore the character if 'x' or 'X' appears as the last character of 91 // the string. 92 while (found != string::npos && found < candidate.length() - 1) { 93 // We only look for 'x' or 'X' in ASCII form. 94 char next_char = candidate[found + 1]; 95 if (next_char == 'x' || next_char == 'X') { 96 // This is the carrier code case, in which the 'X's always precede the 97 // national significant number. 98 ++found; 99 if (util.IsNumberMatchWithOneString( 100 number, candidate.substr(found, candidate.length() - found)) 101 != PhoneNumberUtil::NSN_MATCH) { 102 return false; 103 } 104 } else { 105 string normalized_extension(candidate.substr(found, 106 candidate.length() - found)); 107 util.NormalizeDigitsOnly(&normalized_extension); 108 if (normalized_extension != number.extension()) { 109 return false; 110 } 111 } 112 found = candidate.find_first_of("xX", found + 1); 113 } 114 return true; 115} 116 117bool AllNumberGroupsRemainGrouped( 118 const PhoneNumberUtil& util, 119 const PhoneNumber& phone_number, 120 const string& normalized_candidate, 121 const vector<string>& formatted_number_groups) { 122 size_t from_index = 0; 123 // Check each group of consecutive digits are not broken into separate 124 // groupings in the normalized_candidate string. 125 for (size_t i = 0; i < formatted_number_groups.size(); ++i) { 126 // Fails if the substring of normalized_candidate starting from from_index 127 // doesn't contain the consecutive digits in formatted_number_groups.at(i). 128 from_index = normalized_candidate.find(formatted_number_groups.at(i), 129 from_index); 130 if (from_index == string::npos) { 131 return false; 132 } 133 // Moves from_index forward. 134 from_index += formatted_number_groups.at(i).length(); 135 if (i == 0 && from_index < normalized_candidate.length()) { 136 // We are at the position right after the NDC. We get the region used for 137 // formatting information based on the country code in the phone number, 138 // rather than the number itself, as we do not need to distinguish between 139 // different countries with the same country calling code and this is 140 // faster. 141 string region; 142 util.GetRegionCodeForCountryCode(phone_number.country_code(), ®ion); 143 string ndd_prefix; 144 util.GetNddPrefixForRegion(region, true, &ndd_prefix); 145 // Note although normalized_candidate might contain non-ASCII formatting 146 // characters, they won't be treated as ASCII digits when converted to a 147 // char. 148 if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) { 149 // This means there is no formatting symbol after the NDC. In this case, 150 // we only accept the number if there is no formatting symbol at all in 151 // the number, except for extensions. This is only important for 152 // countries with national prefixes. 153 string national_significant_number; 154 util.GetNationalSignificantNumber( 155 phone_number, &national_significant_number); 156 return HasPrefixString(normalized_candidate.substr( 157 from_index - formatted_number_groups.at(i).length()), 158 national_significant_number); 159 } 160 } 161 } 162 // The check here makes sure that we haven't mistakenly already used the 163 // extension to match the last group of the subscriber number. Note the 164 // extension cannot have formatting in-between digits. 165 return normalized_candidate.substr(from_index) 166 .find(phone_number.extension()) != string::npos; 167} 168 169bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) { 170#if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS) 171 if (!alternate_formats->ParseFromArray(alternate_format_get(), 172 alternate_format_size())) { 173 LOG(ERROR) << "Could not parse binary data."; 174 return false; 175 } 176 return true; 177#else 178 return false; 179#endif 180} 181 182} // namespace 183 184class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> { 185 private: 186 friend class Singleton<PhoneNumberMatcherRegExps>; 187 188 string opening_parens_; 189 string closing_parens_; 190 string non_parens_; 191 // Limit on the number of pairs of brackets in a phone number. 192 string bracket_pair_limit_; 193 // Helper strings for the matching_brackets_ pattern. 194 // An opening bracket at the beginning may not be closed, but subsequent ones 195 // should be. It's also possible that the leading bracket was dropped, so we 196 // shouldn't be surprised if we see a closing bracket first. 197 string leading_maybe_matched_bracket_; 198 string bracket_pairs_; 199 // Limit on the number of leading (plus) characters. 200 string lead_limit_; 201 // Limit on the number of consecutive punctuation characters. 202 string punctuation_limit_; 203 // The maximum number of digits allowed in a digit-separated block. As we 204 // allow all digits in a single block, this should be set high enough to 205 // accommodate the entire national number and the international country code. 206 int digit_block_limit_; 207 // Limit on the number of blocks separated by punctuation. Uses 208 // kDigitBlockLimit since some formats use spaces to separate each digit. 209 string block_limit_; 210 // A punctuation sequence allowing white space. 211 string punctuation_; 212 // A digits block without punctuation. 213 string digit_sequence_; 214 // Punctuation that may be at the start of a phone number - brackets and plus 215 // signs. 216 string lead_class_chars_; 217 // Same as lead_class_chars_, but enclosed as a character class. 218 string lead_class_; 219 // Extra helper strings that form part of pattern_. These are stored 220 // separately since StrCat has a limit of 12 args. 221 string opening_punctuation_; 222 string optional_extn_pattern_; 223 224 public: 225 // We use two different reg-ex factories here for performance reasons. RE2 is 226 // much faster for smaller reg-ex patterns, but the main pattern cannot be 227 // handled by RE2 in an efficient way. 228 scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_; 229 scoped_ptr<const AbstractRegExpFactory> regexp_factory_; 230 231 // Matches strings that look like publication pages. Example: 232 // Computing Complete Answers to Queries in the Presence of Limited Access 233 // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003). 234 // 235 // The string "211-227 (2003)" is not a telephone number. 236 scoped_ptr<const RegExp> pub_pages_; 237 // Matches strings that look like dates using "/" as a separator. Examples: 238 // 3/10/2011, 31/10/96 or 08/31/95. 239 scoped_ptr<const RegExp> slash_separated_dates_; 240 // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does 241 // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_. 242 scoped_ptr<const RegExp> time_stamps_; 243 scoped_ptr<const RegExp> time_stamps_suffix_; 244 // Pattern to check that brackets match. Opening brackets should be closed 245 // within a phone number. This also checks that there is something inside the 246 // brackets. Having no brackets at all is also fine. 247 scoped_ptr<const RegExp> matching_brackets_; 248 // Matches white-space, which may indicate the end of a phone number and the 249 // start of something else (such as a neighbouring zip-code). If white-space 250 // is found, continues to match all characters that are not typically used to 251 // start a phone number. 252 scoped_ptr<const RegExp> group_separator_; 253 scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_; 254 scoped_ptr<const RegExp> capturing_ascii_digits_pattern_; 255 // Compiled reg-ex representing lead_class_; 256 scoped_ptr<const RegExp> lead_class_pattern_; 257 // Phone number pattern allowing optional punctuation. 258 scoped_ptr<const RegExp> pattern_; 259 260 PhoneNumberMatcherRegExps() 261 : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */), 262 closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */), 263 non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")), 264 bracket_pair_limit_(Limit(0, 3)), 265 leading_maybe_matched_bracket_(StrCat( 266 "(?:[", opening_parens_, "])?", 267 "(?:", non_parens_, "+[", closing_parens_, "])?")), 268 bracket_pairs_(StrCat( 269 "(?:[", opening_parens_, "]", non_parens_, "+", 270 "[", closing_parens_, "])", bracket_pair_limit_)), 271 lead_limit_(Limit(0, 2)), 272 punctuation_limit_(Limit(0, 4)), 273 digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn + 274 PhoneNumberUtil::kMaxLengthCountryCode), 275 block_limit_(Limit(0, digit_block_limit_)), 276 punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]", 277 punctuation_limit_)), 278 digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))), 279 lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)), 280 lead_class_(StrCat("[", lead_class_chars_, "]")), 281 opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")), 282 optional_extn_pattern_(StrCat( 283 "(?i)(?:", 284 PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(), 285 ")?")), 286 regexp_factory_for_pattern_(new ICURegExpFactory()), 287#ifdef I18N_PHONENUMBERS_USE_RE2 288 regexp_factory_(new RE2RegExpFactory()), 289#else 290 regexp_factory_(new ICURegExpFactory()), 291#endif // I18N_PHONENUMBERS_USE_RE2 292 pub_pages_(regexp_factory_->CreateRegExp( 293 "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")), 294 slash_separated_dates_(regexp_factory_->CreateRegExp( 295 "(?:(?:[0-3]?\\d/[01]?\\d)|" 296 "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")), 297 time_stamps_(regexp_factory_->CreateRegExp( 298 "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")), 299 time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")), 300 matching_brackets_(regexp_factory_->CreateRegExp( 301 StrCat(leading_maybe_matched_bracket_, non_parens_, "+", 302 bracket_pairs_, non_parens_, "*"))), 303 group_separator_(regexp_factory_->CreateRegExp( 304 StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))), 305 capture_up_to_second_number_start_pattern_( 306 regexp_factory_->CreateRegExp( 307 PhoneNumberUtil::kCaptureUpToSecondNumberStart)), 308 capturing_ascii_digits_pattern_( 309 regexp_factory_->CreateRegExp("(\\d+)")), 310 lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)), 311 pattern_(regexp_factory_for_pattern_->CreateRegExp( 312 StrCat("(", opening_punctuation_, lead_limit_, 313 digit_sequence_, "(?:", punctuation_, digit_sequence_, ")", 314 block_limit_, optional_extn_pattern_, ")"))) { 315 } 316 317 private: 318 DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps); 319}; 320 321class AlternateFormats : public Singleton<AlternateFormats> { 322 public: 323 PhoneMetadataCollection format_data_; 324 325 map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_; 326 327 AlternateFormats() 328 : format_data_(), 329 calling_code_to_alternate_formats_map_() { 330 if (!LoadAlternateFormats(&format_data_)) { 331 LOG(DFATAL) << "Could not parse compiled-in metadata."; 332 return; 333 } 334 for (RepeatedPtrField<PhoneMetadata>::const_iterator it = 335 format_data_.metadata().begin(); 336 it != format_data_.metadata().end(); 337 ++it) { 338 calling_code_to_alternate_formats_map_.insert( 339 make_pair(it->country_code(), &*it)); 340 } 341 } 342 343 const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code) 344 const { 345 map<int, const PhoneMetadata*>::const_iterator it = 346 calling_code_to_alternate_formats_map_.find(country_calling_code); 347 if (it != calling_code_to_alternate_formats_map_.end()) { 348 return it->second; 349 } 350 return NULL; 351 } 352 353 private: 354 DISALLOW_COPY_AND_ASSIGN(AlternateFormats); 355}; 356 357PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util, 358 const string& text, 359 const string& region_code, 360 PhoneNumberMatcher::Leniency leniency, 361 int max_tries) 362 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), 363 alternate_formats_(AlternateFormats::GetInstance()), 364 phone_util_(util), 365 text_(text), 366 preferred_region_(region_code), 367 leniency_(leniency), 368 max_tries_(max_tries), 369 state_(NOT_READY), 370 last_match_(NULL), 371 search_index_(0) { 372} 373 374PhoneNumberMatcher::PhoneNumberMatcher(const string& text, 375 const string& region_code) 376 : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), 377 alternate_formats_(NULL), // Not used. 378 phone_util_(*PhoneNumberUtil::GetInstance()), 379 text_(text), 380 preferred_region_(region_code), 381 leniency_(VALID), 382 max_tries_(numeric_limits<int>::max()), 383 state_(NOT_READY), 384 last_match_(NULL), 385 search_index_(0) { 386} 387 388PhoneNumberMatcher::~PhoneNumberMatcher() { 389} 390 391// static 392bool PhoneNumberMatcher::IsLatinLetter(char32 letter) { 393 // Combining marks are a subset of non-spacing-mark. 394 if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) { 395 return false; 396 } 397 UBlockCode block = ublock_getCode(letter); 398 return ((block == UBLOCK_BASIC_LATIN) || 399 (block == UBLOCK_LATIN_1_SUPPLEMENT) || 400 (block == UBLOCK_LATIN_EXTENDED_A) || 401 (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) || 402 (block == UBLOCK_LATIN_EXTENDED_B) || 403 (block == UBLOCK_COMBINING_DIACRITICAL_MARKS)); 404} 405 406bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset, 407 PhoneNumberMatch* match) { 408 DCHECK(match); 409 // Check the candidate doesn't contain any formatting which would indicate 410 // that it really isn't a phone number. 411 if (!reg_exps_->matching_brackets_->FullMatch(candidate)) { 412 return false; 413 } 414 415 // If leniency is set to VALID or stricter, we also want to skip numbers that 416 // are surrounded by Latin alphabetic characters, to skip cases like 417 // abc8005001234 or 8005001234def. 418 if (leniency_ >= VALID) { 419 // If the candidate is not at the start of the text, and does not start with 420 // phone-number punctuation, check the previous character. 421 scoped_ptr<RegExpInput> candidate_input( 422 reg_exps_->regexp_factory_->CreateInput(candidate)); 423 if (offset > 0 && 424 !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) { 425 char32 previous_char; 426 const char* previous_char_ptr = 427 EncodingUtils::BackUpOneUTF8Character(text_.c_str(), 428 text_.c_str() + offset); 429 EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char); 430 // We return false if it is a latin letter or an invalid punctuation 431 // symbol. 432 if (IsInvalidPunctuationSymbol(previous_char) || 433 IsLatinLetter(previous_char)) { 434 return false; 435 } 436 } 437 size_t lastCharIndex = offset + candidate.length(); 438 if (lastCharIndex < text_.length()) { 439 char32 next_char; 440 const char* next_char_ptr = 441 EncodingUtils::AdvanceOneUTF8Character( 442 text_.c_str() + lastCharIndex - 1); 443 EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char); 444 if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) { 445 return false; 446 } 447 } 448 } 449 450 PhoneNumber number; 451 if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) != 452 PhoneNumberUtil::NO_PARSING_ERROR) { 453 return false; 454 } 455 if (VerifyAccordingToLeniency(leniency_, number, candidate)) { 456 match->set_start(offset); 457 match->set_raw_string(candidate); 458 // We used ParseAndKeepRawInput to create this number, but for now we don't 459 // return the extra values parsed. TODO: stop clearing all values here and 460 // switch all users over to using raw_input() rather than the raw_string() 461 // of PhoneNumberMatch. 462 number.clear_country_code_source(); 463 number.clear_preferred_domestic_carrier_code(); 464 number.clear_raw_input(); 465 match->set_number(number); 466 return true; 467 } 468 return false; 469} 470 471// Helper method to replace the verification method for each enum in the Java 472// version. 473bool PhoneNumberMatcher::VerifyAccordingToLeniency( 474 Leniency leniency, const PhoneNumber& number, 475 const string& candidate) const { 476 switch (leniency) { 477 case PhoneNumberMatcher::POSSIBLE: 478 return phone_util_.IsPossibleNumber(number); 479 case PhoneNumberMatcher::VALID: 480 if (!phone_util_.IsValidNumber(number) || 481 !ContainsOnlyValidXChars(number, candidate, phone_util_)) { 482 return false; 483 } 484 return IsNationalPrefixPresentIfRequired(number); 485 case PhoneNumberMatcher::STRICT_GROUPING: { 486 if (!phone_util_.IsValidNumber(number) || 487 !ContainsOnlyValidXChars(number, candidate, phone_util_) || 488 ContainsMoreThanOneSlashInNationalNumber( 489 number, candidate, phone_util_) || 490 !IsNationalPrefixPresentIfRequired(number)) { 491 return false; 492 } 493 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 494 const string&, const vector<string>&>* callback = 495 NewPermanentCallback(&AllNumberGroupsRemainGrouped); 496 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); 497 delete(callback); 498 return is_valid; 499 } 500 case PhoneNumberMatcher::EXACT_GROUPING: { 501 if (!phone_util_.IsValidNumber(number) || 502 !ContainsOnlyValidXChars(number, candidate, phone_util_) || 503 ContainsMoreThanOneSlashInNationalNumber( 504 number, candidate, phone_util_) || 505 !IsNationalPrefixPresentIfRequired(number)) { 506 return false; 507 } 508 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 509 const string&, const vector<string>&>* callback = 510 NewPermanentCallback( 511 this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent); 512 bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); 513 delete(callback); 514 return is_valid; 515 } 516 default: 517 LOG(ERROR) << "No implementation defined for verification for leniency " 518 << static_cast<int>(leniency); 519 return false; 520 } 521} 522 523bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset, 524 PhoneNumberMatch* match) { 525 DCHECK(match); 526 // Try removing either the first or last "group" in the number and see if this 527 // gives a result. We consider white space to be a possible indication of 528 // the start or end of the phone number. 529 scoped_ptr<RegExpInput> candidate_input( 530 reg_exps_->regexp_factory_->CreateInput(candidate)); 531 if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), 532 NULL)) { 533 // Try the first group by itself. 534 int group_start_index = 535 candidate.length() - candidate_input->ToString().length(); 536 string first_group_only = candidate.substr(0, group_start_index); 537 phone_util_.TrimUnwantedEndChars(&first_group_only); 538 bool success = ParseAndVerify(first_group_only, offset, match); 539 if (success) { 540 return true; 541 } 542 --max_tries_; 543 544 // Try the rest of the candidate without the first group. 545 string without_first_group(candidate_input->ToString()); 546 phone_util_.TrimUnwantedEndChars(&without_first_group); 547 success = 548 ParseAndVerify(without_first_group, offset + group_start_index, match); 549 if (success) { 550 return true; 551 } 552 --max_tries_; 553 554 if (max_tries_ > 0) { 555 while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), 556 NULL)) { 557 // Find the last group. 558 } 559 int last_group_start = 560 candidate.length() - candidate_input->ToString().length(); 561 string without_last_group = candidate.substr(0, last_group_start); 562 phone_util_.TrimUnwantedEndChars(&without_last_group); 563 if (without_last_group == first_group_only) { 564 // If there are only two groups, then the group "without the last group" 565 // is the same as the first group. In these cases, we don't want to 566 // re-check the number group, so we exit already. 567 return false; 568 } 569 success = ParseAndVerify(without_last_group, offset, match); 570 if (success) { 571 return true; 572 } 573 --max_tries_; 574 } 575 } 576 return false; 577} 578 579bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset, 580 PhoneNumberMatch* match) { 581 DCHECK(match); 582 // Skip a match that is more likely a publication page reference or a date. 583 if (reg_exps_->pub_pages_->PartialMatch(candidate) || 584 reg_exps_->slash_separated_dates_->PartialMatch(candidate)) { 585 return false; 586 } 587 // Skip potential time-stamps. 588 if (reg_exps_->time_stamps_->PartialMatch(candidate)) { 589 scoped_ptr<RegExpInput> following_text( 590 reg_exps_->regexp_factory_->CreateInput( 591 text_.substr(offset + candidate.size()))); 592 if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) { 593 return false; 594 } 595 } 596 597 // Try to come up with a valid match given the entire candidate. 598 if (ParseAndVerify(candidate, offset, match)) { 599 return true; 600 } 601 602 // If that failed, try to find an "inner match" - there might be a phone 603 // number within this candidate. 604 return ExtractInnerMatch(candidate, offset, match); 605} 606 607bool PhoneNumberMatcher::HasNext() { 608 if (state_ == NOT_READY) { 609 PhoneNumberMatch temp_match; 610 if (!Find(search_index_, &temp_match)) { 611 state_ = DONE; 612 } else { 613 last_match_.reset(new PhoneNumberMatch(temp_match.start(), 614 temp_match.raw_string(), 615 temp_match.number())); 616 search_index_ = last_match_->end(); 617 state_ = READY; 618 } 619 } 620 return state_ == READY; 621} 622 623bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) { 624 DCHECK(match); 625 // Check the state and find the next match as a side-effect if necessary. 626 if (!HasNext()) { 627 return false; 628 } 629 match->CopyFrom(*last_match_); 630 state_ = NOT_READY; 631 last_match_.reset(NULL); 632 return true; 633} 634 635bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) { 636 DCHECK(match); 637 638 scoped_ptr<RegExpInput> text( 639 reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index))); 640 string candidate; 641 while ((max_tries_ > 0) && 642 reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) { 643 int start = text_.length() - text->ToString().length() - candidate.length(); 644 // Check for extra numbers at the end. 645 reg_exps_->capture_up_to_second_number_start_pattern_-> 646 PartialMatch(candidate, &candidate); 647 if (ExtractMatch(candidate, start, match)) { 648 return true; 649 } 650 651 index = start + candidate.length(); 652 --max_tries_; 653 } 654 return false; 655} 656 657bool PhoneNumberMatcher::CheckNumberGroupingIsValid( 658 const PhoneNumber& phone_number, 659 const string& candidate, 660 ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, 661 const string&, const vector<string>&>* checker) const { 662 DCHECK(checker); 663 // TODO: Evaluate how this works for other locales (testing has been limited 664 // to NANPA regions) and optimise if necessary. 665 string normalized_candidate = 666 NormalizeUTF8::NormalizeDecimalDigits(candidate); 667 vector<string> formatted_number_groups; 668 GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern 669 &formatted_number_groups); 670 if (checker->Run(phone_util_, phone_number, normalized_candidate, 671 formatted_number_groups)) { 672 return true; 673 } 674 // If this didn't pass, see if there are any alternate formats, and try them 675 // instead. 676 const PhoneMetadata* alternate_formats = 677 alternate_formats_->GetAlternateFormatsForCountry( 678 phone_number.country_code()); 679 if (alternate_formats) { 680 for (RepeatedPtrField<NumberFormat>::const_iterator it = 681 alternate_formats->number_format().begin(); 682 it != alternate_formats->number_format().end(); ++it) { 683 formatted_number_groups.clear(); 684 GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups); 685 if (checker->Run(phone_util_, phone_number, normalized_candidate, 686 formatted_number_groups)) { 687 return true; 688 } 689 } 690 } 691 return false; 692} 693 694// Helper method to get the national-number part of a number, formatted without 695// any national prefix, and return it as a set of digit blocks that would be 696// formatted together. 697void PhoneNumberMatcher::GetNationalNumberGroups( 698 const PhoneNumber& number, 699 const NumberFormat* formatting_pattern, 700 vector<string>* digit_blocks) const { 701 string rfc3966_format; 702 if (!formatting_pattern) { 703 // This will be in the format +CC-DG;ext=EXT where DG represents groups of 704 // digits. 705 phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format); 706 // We remove the extension part from the formatted string before splitting 707 // it into different groups. 708 size_t end_index = rfc3966_format.find(';'); 709 if (end_index == string::npos) { 710 end_index = rfc3966_format.length(); 711 } 712 // The country-code will have a '-' following it. 713 size_t start_index = rfc3966_format.find('-') + 1; 714 SplitStringUsing(rfc3966_format.substr(start_index, 715 end_index - start_index), 716 "-", digit_blocks); 717 } else { 718 // We format the NSN only, and split that according to the separator. 719 string national_significant_number; 720 phone_util_.GetNationalSignificantNumber(number, 721 &national_significant_number); 722 phone_util_.FormatNsnUsingPattern(national_significant_number, 723 *formatting_pattern, 724 PhoneNumberUtil::RFC3966, 725 &rfc3966_format); 726 SplitStringUsing(rfc3966_format, "-", digit_blocks); 727 } 728} 729 730bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired( 731 const PhoneNumber& number) const { 732 // First, check how we deduced the country code. If it was written in 733 // international format, then the national prefix is not required. 734 if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) { 735 return true; 736 } 737 string phone_number_region; 738 phone_util_.GetRegionCodeForCountryCode( 739 number.country_code(), &phone_number_region); 740 const PhoneMetadata* metadata = 741 phone_util_.GetMetadataForRegion(phone_number_region); 742 if (!metadata) { 743 return true; 744 } 745 // Check if a national prefix should be present when formatting this number. 746 string national_number; 747 phone_util_.GetNationalSignificantNumber(number, &national_number); 748 const NumberFormat* format_rule = 749 phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(), 750 national_number); 751 // To do this, we check that a national prefix formatting rule was present and 752 // that it wasn't just the first-group symbol ($1) with punctuation. 753 if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) { 754 if (format_rule->national_prefix_optional_when_formatting()) { 755 // The national-prefix is optional in these cases, so we don't need to 756 // check if it was present. 757 return true; 758 } 759 if (phone_util_.FormattingRuleHasFirstGroupOnly( 760 format_rule->national_prefix_formatting_rule())) { 761 // National Prefix not needed for this number. 762 return true; 763 } 764 // Normalize the remainder. 765 string raw_input_copy(number.raw_input()); 766 // Check if we found a national prefix and/or carrier code at the start of 767 // the raw input, and return the result. 768 phone_util_.NormalizeDigitsOnly(&raw_input_copy); 769 return phone_util_.MaybeStripNationalPrefixAndCarrierCode( 770 *metadata, 771 &raw_input_copy, 772 NULL); // Don't need to keep the stripped carrier code. 773 } 774 return true; 775} 776 777bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent( 778 const PhoneNumberUtil& util, 779 const PhoneNumber& phone_number, 780 const string& normalized_candidate, 781 const vector<string>& formatted_number_groups) const { 782 const scoped_ptr<RegExpInput> candidate_number( 783 reg_exps_->regexp_factory_->CreateInput(normalized_candidate)); 784 vector<string> candidate_groups; 785 string digit_block; 786 while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume( 787 candidate_number.get(), 788 &digit_block)) { 789 candidate_groups.push_back(digit_block); 790 } 791 792 // Set this to the last group, skipping it if the number has an extension. 793 int candidate_number_group_index = 794 phone_number.has_extension() ? candidate_groups.size() - 2 795 : candidate_groups.size() - 1; 796 // First we check if the national significant number is formatted as a block. 797 // We use find and not equals, since the national significant number may be 798 // present with a prefix such as a national number prefix, or the country code 799 // itself. 800 string national_significant_number; 801 util.GetNationalSignificantNumber(phone_number, 802 &national_significant_number); 803 if (candidate_groups.size() == 1 || 804 candidate_groups.at(candidate_number_group_index).find( 805 national_significant_number) != string::npos) { 806 return true; 807 } 808 // Starting from the end, go through in reverse, excluding the first group, 809 // and check the candidate and number groups are the same. 810 for (int formatted_number_group_index = 811 (formatted_number_groups.size() - 1); 812 formatted_number_group_index > 0 && 813 candidate_number_group_index >= 0; 814 --formatted_number_group_index, --candidate_number_group_index) { 815 if (candidate_groups.at(candidate_number_group_index) != 816 formatted_number_groups.at(formatted_number_group_index)) { 817 return false; 818 } 819 } 820 // Now check the first group. There may be a national prefix at the start, so 821 // we only check that the candidate group ends with the formatted number 822 // group. 823 return (candidate_number_group_index >= 0 && 824 HasSuffixString(candidate_groups.at(candidate_number_group_index), 825 formatted_number_groups.at(0))); 826} 827 828// static 829bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber( 830 const PhoneNumber& number, 831 const string& candidate, 832 const PhoneNumberUtil& util) { 833 size_t first_slash_in_body = candidate.find('/'); 834 if (first_slash_in_body == string::npos) { 835 // No slashes, this is okay. 836 return false; 837 } 838 // Now look for a second one. 839 size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1); 840 if (second_slash_in_body == string::npos) { 841 // Only one slash, this is okay. 842 return false; 843 } 844 845 // If the first slash is after the country calling code, this is permitted. 846 if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN || 847 number.country_code_source() == 848 PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) { 849 string normalized_country_code = 850 candidate.substr(0, first_slash_in_body); 851 util.NormalizeDigitsOnly(&normalized_country_code); 852 if (normalized_country_code == SimpleItoa(number.country_code())) { 853 // Any more slashes and this is illegal. 854 return candidate.find('/', second_slash_in_body + 1) != string::npos; 855 } 856 } 857 return true; 858} 859 860} // namespace phonenumbers 861} // namespace i18n 862