1// Copyright (C) 2011 The Libphonenumber Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Author: Lara Rennie
16// Author: Tao Huang
17//
18// Implementation of a stateful class that finds and extracts telephone numbers
19// from text.
20
21#include "phonenumbers/phonenumbermatcher.h"
22
23#ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP
24#error phonenumbermatcher depends on ICU \
25    (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set)
26#endif  // I18N_PHONENUMBERS_USE_ICU_REGEXP
27
28#include <ctype.h>
29#include <stddef.h>
30#include <limits>
31#include <map>
32#include <string>
33#include <utility>
34#include <vector>
35
36#include <unicode/uchar.h>
37
38#include "phonenumbers/alternate_format.h"
39#include "phonenumbers/base/logging.h"
40#include "phonenumbers/base/memory/scoped_ptr.h"
41#include "phonenumbers/base/memory/singleton.h"
42#include "phonenumbers/callback.h"
43#include "phonenumbers/default_logger.h"
44#include "phonenumbers/encoding_utils.h"
45#include "phonenumbers/normalize_utf8.h"
46#include "phonenumbers/phonemetadata.pb.h"
47#include "phonenumbers/phonenumber.pb.h"
48#include "phonenumbers/phonenumbermatch.h"
49#include "phonenumbers/phonenumberutil.h"
50#include "phonenumbers/regexp_adapter.h"
51#include "phonenumbers/regexp_adapter_icu.h"
52#include "phonenumbers/stringutil.h"
53
54#ifdef I18N_PHONENUMBERS_USE_RE2
55#include "phonenumbers/regexp_adapter_re2.h"
56#endif  // I18N_PHONENUMBERS_USE_RE2_AND_ICU
57
58using std::make_pair;
59using std::map;
60using std::numeric_limits;
61using std::string;
62using std::vector;
63
64namespace i18n {
65namespace phonenumbers {
66
67namespace {
68// Returns a regular expression quantifier with an upper and lower limit.
69string Limit(int lower, int upper) {
70  DCHECK_GE(lower, 0);
71  DCHECK_GT(upper, 0);
72  DCHECK_LT(lower, upper);
73  return StrCat("{", lower, ",", upper, "}");
74}
75
76bool IsInvalidPunctuationSymbol(char32 character) {
77  return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL;
78}
79
80bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
81                             const PhoneNumberUtil& util) {
82  // The characters 'x' and 'X' can be (1) a carrier code, in which case they
83  // always precede the national significant number or (2) an extension sign,
84  // in which case they always precede the extension number. We assume a
85  // carrier code is more than 1 digit, so the first case has to have more than
86  // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1
87  // 'x' or 'X'.
88  size_t found;
89  found = candidate.find_first_of("xX");
90  // We ignore the character if 'x' or 'X' appears as the last character of
91  // the string.
92  while (found != string::npos && found < candidate.length() - 1) {
93    // We only look for 'x' or 'X' in ASCII form.
94    char next_char = candidate[found + 1];
95    if (next_char == 'x' || next_char == 'X') {
96      // This is the carrier code case, in which the 'X's always precede the
97      // national significant number.
98      ++found;
99      if (util.IsNumberMatchWithOneString(
100              number, candidate.substr(found, candidate.length() - found))
101          != PhoneNumberUtil::NSN_MATCH) {
102        return false;
103      }
104    } else {
105      string normalized_extension(candidate.substr(found,
106                                                   candidate.length() - found));
107      util.NormalizeDigitsOnly(&normalized_extension);
108      if (normalized_extension != number.extension()) {
109        return false;
110      }
111    }
112    found = candidate.find_first_of("xX", found + 1);
113  }
114  return true;
115}
116
117bool AllNumberGroupsRemainGrouped(
118    const PhoneNumberUtil& util,
119    const PhoneNumber& phone_number,
120    const string& normalized_candidate,
121    const vector<string>& formatted_number_groups) {
122  size_t from_index = 0;
123  // Check each group of consecutive digits are not broken into separate
124  // groupings in the normalized_candidate string.
125  for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
126    // Fails if the substring of normalized_candidate starting from from_index
127    // doesn't contain the consecutive digits in formatted_number_groups.at(i).
128    from_index = normalized_candidate.find(formatted_number_groups.at(i),
129                                           from_index);
130    if (from_index == string::npos) {
131      return false;
132    }
133    // Moves from_index forward.
134    from_index += formatted_number_groups.at(i).length();
135    if (i == 0 && from_index < normalized_candidate.length()) {
136      // We are at the position right after the NDC. We get the region used for
137      // formatting information based on the country code in the phone number,
138      // rather than the number itself, as we do not need to distinguish between
139      // different countries with the same country calling code and this is
140      // faster.
141      string region;
142      util.GetRegionCodeForCountryCode(phone_number.country_code(), &region);
143      string ndd_prefix;
144      util.GetNddPrefixForRegion(region, true, &ndd_prefix);
145      // Note although normalized_candidate might contain non-ASCII formatting
146      // characters, they won't be treated as ASCII digits when converted to a
147      // char.
148      if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) {
149        // This means there is no formatting symbol after the NDC. In this case,
150        // we only accept the number if there is no formatting symbol at all in
151        // the number, except for extensions. This is only important for
152        // countries with national prefixes.
153        string national_significant_number;
154        util.GetNationalSignificantNumber(
155            phone_number, &national_significant_number);
156        return HasPrefixString(normalized_candidate.substr(
157            from_index - formatted_number_groups.at(i).length()),
158            national_significant_number);
159        }
160      }
161    }
162    // The check here makes sure that we haven't mistakenly already used the
163    // extension to match the last group of the subscriber number. Note the
164    // extension cannot have formatting in-between digits.
165    return normalized_candidate.substr(from_index)
166        .find(phone_number.extension()) != string::npos;
167}
168
169bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) {
170#if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS)
171  if (!alternate_formats->ParseFromArray(alternate_format_get(),
172                                         alternate_format_size())) {
173    LOG(ERROR) << "Could not parse binary data.";
174    return false;
175  }
176  return true;
177#else
178  return false;
179#endif
180}
181
182}  // namespace
183
184class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
185 private:
186  friend class Singleton<PhoneNumberMatcherRegExps>;
187
188  string opening_parens_;
189  string closing_parens_;
190  string non_parens_;
191  // Limit on the number of pairs of brackets in a phone number.
192  string bracket_pair_limit_;
193  // Helper strings for the matching_brackets_ pattern.
194  // An opening bracket at the beginning may not be closed, but subsequent ones
195  // should be. It's also possible that the leading bracket was dropped, so we
196  // shouldn't be surprised if we see a closing bracket first.
197  string leading_maybe_matched_bracket_;
198  string bracket_pairs_;
199  // Limit on the number of leading (plus) characters.
200  string lead_limit_;
201  // Limit on the number of consecutive punctuation characters.
202  string punctuation_limit_;
203  // The maximum number of digits allowed in a digit-separated block. As we
204  // allow all digits in a single block, this should be set high enough to
205  // accommodate the entire national number and the international country code.
206  int digit_block_limit_;
207  // Limit on the number of blocks separated by punctuation. Uses
208  // kDigitBlockLimit since some formats use spaces to separate each digit.
209  string block_limit_;
210  // A punctuation sequence allowing white space.
211  string punctuation_;
212  // A digits block without punctuation.
213  string digit_sequence_;
214  // Punctuation that may be at the start of a phone number - brackets and plus
215  // signs.
216  string lead_class_chars_;
217  // Same as lead_class_chars_, but enclosed as a character class.
218  string lead_class_;
219  // Extra helper strings that form part of pattern_. These are stored
220  // separately since StrCat has a limit of 12 args.
221  string opening_punctuation_;
222  string optional_extn_pattern_;
223
224 public:
225  // We use two different reg-ex factories here for performance reasons. RE2 is
226  // much faster for smaller reg-ex patterns, but the main pattern cannot be
227  // handled by RE2 in an efficient way.
228  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
229  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;
230
231  // Matches strings that look like publication pages. Example:
232  // Computing Complete Answers to Queries in the Presence of Limited Access
233  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
234  //
235  // The string "211-227 (2003)" is not a telephone number.
236  scoped_ptr<const RegExp> pub_pages_;
237  // Matches strings that look like dates using "/" as a separator. Examples:
238  // 3/10/2011, 31/10/96 or 08/31/95.
239  scoped_ptr<const RegExp> slash_separated_dates_;
240  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
241  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
242  scoped_ptr<const RegExp> time_stamps_;
243  scoped_ptr<const RegExp> time_stamps_suffix_;
244  // Pattern to check that brackets match. Opening brackets should be closed
245  // within a phone number. This also checks that there is something inside the
246  // brackets. Having no brackets at all is also fine.
247  scoped_ptr<const RegExp> matching_brackets_;
248  // Matches white-space, which may indicate the end of a phone number and the
249  // start of something else (such as a neighbouring zip-code). If white-space
250  // is found, continues to match all characters that are not typically used to
251  // start a phone number.
252  scoped_ptr<const RegExp> group_separator_;
253  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
254  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
255  // Compiled reg-ex representing lead_class_;
256  scoped_ptr<const RegExp> lead_class_pattern_;
257  // Phone number pattern allowing optional punctuation.
258  scoped_ptr<const RegExp> pattern_;
259
260  PhoneNumberMatcherRegExps()
261      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */),
262        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */),
263        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
264        bracket_pair_limit_(Limit(0, 3)),
265        leading_maybe_matched_bracket_(StrCat(
266            "(?:[", opening_parens_, "])?",
267            "(?:", non_parens_, "+[", closing_parens_, "])?")),
268        bracket_pairs_(StrCat(
269            "(?:[", opening_parens_, "]", non_parens_, "+",
270            "[", closing_parens_, "])", bracket_pair_limit_)),
271        lead_limit_(Limit(0, 2)),
272        punctuation_limit_(Limit(0, 4)),
273        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
274                           PhoneNumberUtil::kMaxLengthCountryCode),
275        block_limit_(Limit(0, digit_block_limit_)),
276        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
277                            punctuation_limit_)),
278        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
279        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
280        lead_class_(StrCat("[", lead_class_chars_, "]")),
281        opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
282        optional_extn_pattern_(StrCat(
283            "(?i)(?:",
284            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
285            ")?")),
286        regexp_factory_for_pattern_(new ICURegExpFactory()),
287#ifdef I18N_PHONENUMBERS_USE_RE2
288        regexp_factory_(new RE2RegExpFactory()),
289#else
290        regexp_factory_(new ICURegExpFactory()),
291#endif  // I18N_PHONENUMBERS_USE_RE2
292        pub_pages_(regexp_factory_->CreateRegExp(
293            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
294        slash_separated_dates_(regexp_factory_->CreateRegExp(
295            "(?:(?:[0-3]?\\d/[01]?\\d)|"
296            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
297        time_stamps_(regexp_factory_->CreateRegExp(
298            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")),
299        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
300        matching_brackets_(regexp_factory_->CreateRegExp(
301            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
302                   bracket_pairs_, non_parens_, "*"))),
303        group_separator_(regexp_factory_->CreateRegExp(
304            StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
305        capture_up_to_second_number_start_pattern_(
306            regexp_factory_->CreateRegExp(
307                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
308        capturing_ascii_digits_pattern_(
309            regexp_factory_->CreateRegExp("(\\d+)")),
310        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
311        pattern_(regexp_factory_for_pattern_->CreateRegExp(
312            StrCat("(", opening_punctuation_, lead_limit_,
313                   digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
314                   block_limit_, optional_extn_pattern_, ")"))) {
315  }
316
317 private:
318  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
319};
320
321class AlternateFormats : public Singleton<AlternateFormats> {
322 public:
323  PhoneMetadataCollection format_data_;
324
325  map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_;
326
327  AlternateFormats()
328      : format_data_(),
329        calling_code_to_alternate_formats_map_() {
330    if (!LoadAlternateFormats(&format_data_)) {
331      LOG(DFATAL) << "Could not parse compiled-in metadata.";
332      return;
333    }
334    for (RepeatedPtrField<PhoneMetadata>::const_iterator it =
335             format_data_.metadata().begin();
336         it != format_data_.metadata().end();
337         ++it) {
338      calling_code_to_alternate_formats_map_.insert(
339          make_pair(it->country_code(), &*it));
340    }
341  }
342
343  const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code)
344      const {
345    map<int, const PhoneMetadata*>::const_iterator it =
346        calling_code_to_alternate_formats_map_.find(country_calling_code);
347    if (it != calling_code_to_alternate_formats_map_.end()) {
348      return it->second;
349    }
350    return NULL;
351  }
352
353 private:
354  DISALLOW_COPY_AND_ASSIGN(AlternateFormats);
355};
356
357PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
358                                       const string& text,
359                                       const string& region_code,
360                                       PhoneNumberMatcher::Leniency leniency,
361                                       int max_tries)
362    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
363      alternate_formats_(AlternateFormats::GetInstance()),
364      phone_util_(util),
365      text_(text),
366      preferred_region_(region_code),
367      leniency_(leniency),
368      max_tries_(max_tries),
369      state_(NOT_READY),
370      last_match_(NULL),
371      search_index_(0) {
372}
373
374PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
375                                       const string& region_code)
376    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
377      alternate_formats_(NULL),  // Not used.
378      phone_util_(*PhoneNumberUtil::GetInstance()),
379      text_(text),
380      preferred_region_(region_code),
381      leniency_(VALID),
382      max_tries_(numeric_limits<int>::max()),
383      state_(NOT_READY),
384      last_match_(NULL),
385      search_index_(0) {
386}
387
388PhoneNumberMatcher::~PhoneNumberMatcher() {
389}
390
391// static
392bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
393  // Combining marks are a subset of non-spacing-mark.
394  if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) {
395    return false;
396  }
397  UBlockCode block = ublock_getCode(letter);
398  return ((block == UBLOCK_BASIC_LATIN) ||
399      (block == UBLOCK_LATIN_1_SUPPLEMENT) ||
400      (block == UBLOCK_LATIN_EXTENDED_A) ||
401      (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) ||
402      (block == UBLOCK_LATIN_EXTENDED_B) ||
403      (block == UBLOCK_COMBINING_DIACRITICAL_MARKS));
404}
405
406bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
407                                        PhoneNumberMatch* match) {
408  DCHECK(match);
409  // Check the candidate doesn't contain any formatting which would indicate
410  // that it really isn't a phone number.
411  if (!reg_exps_->matching_brackets_->FullMatch(candidate)) {
412    return false;
413  }
414
415  // If leniency is set to VALID or stricter, we also want to skip numbers that
416  // are surrounded by Latin alphabetic characters, to skip cases like
417  // abc8005001234 or 8005001234def.
418  if (leniency_ >= VALID) {
419    // If the candidate is not at the start of the text, and does not start with
420    // phone-number punctuation, check the previous character.
421    scoped_ptr<RegExpInput> candidate_input(
422        reg_exps_->regexp_factory_->CreateInput(candidate));
423    if (offset > 0 &&
424        !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
425      char32 previous_char;
426      const char* previous_char_ptr =
427          EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
428                                                text_.c_str() + offset);
429      EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
430      // We return false if it is a latin letter or an invalid punctuation
431      // symbol.
432      if (IsInvalidPunctuationSymbol(previous_char) ||
433          IsLatinLetter(previous_char)) {
434        return false;
435      }
436    }
437    size_t lastCharIndex = offset + candidate.length();
438    if (lastCharIndex < text_.length()) {
439      char32 next_char;
440      const char* next_char_ptr =
441          EncodingUtils::AdvanceOneUTF8Character(
442              text_.c_str() + lastCharIndex - 1);
443      EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
444      if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) {
445        return false;
446      }
447    }
448  }
449
450  PhoneNumber number;
451  if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) !=
452      PhoneNumberUtil::NO_PARSING_ERROR) {
453    return false;
454  }
455  if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
456    match->set_start(offset);
457    match->set_raw_string(candidate);
458    // We used ParseAndKeepRawInput to create this number, but for now we don't
459    // return the extra values parsed. TODO: stop clearing all values here and
460    // switch all users over to using raw_input() rather than the raw_string()
461    // of PhoneNumberMatch.
462    number.clear_country_code_source();
463    number.clear_preferred_domestic_carrier_code();
464    number.clear_raw_input();
465    match->set_number(number);
466    return true;
467  }
468  return false;
469}
470
471// Helper method to replace the verification method for each enum in the Java
472// version.
473bool PhoneNumberMatcher::VerifyAccordingToLeniency(
474    Leniency leniency, const PhoneNumber& number,
475    const string& candidate) const {
476  switch (leniency) {
477    case PhoneNumberMatcher::POSSIBLE:
478      return phone_util_.IsPossibleNumber(number);
479    case PhoneNumberMatcher::VALID:
480      if (!phone_util_.IsValidNumber(number) ||
481          !ContainsOnlyValidXChars(number, candidate, phone_util_)) {
482        return false;
483      }
484      return IsNationalPrefixPresentIfRequired(number);
485    case PhoneNumberMatcher::STRICT_GROUPING: {
486      if (!phone_util_.IsValidNumber(number) ||
487          !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
488          ContainsMoreThanOneSlashInNationalNumber(
489              number, candidate, phone_util_) ||
490          !IsNationalPrefixPresentIfRequired(number)) {
491        return false;
492      }
493      ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
494                      const string&, const vector<string>&>* callback =
495          NewPermanentCallback(&AllNumberGroupsRemainGrouped);
496      bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
497      delete(callback);
498      return is_valid;
499    }
500    case PhoneNumberMatcher::EXACT_GROUPING: {
501      if (!phone_util_.IsValidNumber(number) ||
502          !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
503          ContainsMoreThanOneSlashInNationalNumber(
504              number, candidate, phone_util_) ||
505          !IsNationalPrefixPresentIfRequired(number)) {
506        return false;
507      }
508      ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
509                      const string&, const vector<string>&>* callback =
510          NewPermanentCallback(
511              this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
512      bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
513      delete(callback);
514      return is_valid;
515    }
516    default:
517      LOG(ERROR) << "No implementation defined for verification for leniency "
518                 << static_cast<int>(leniency);
519      return false;
520  }
521}
522
523bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
524                                           PhoneNumberMatch* match) {
525  DCHECK(match);
526  // Try removing either the first or last "group" in the number and see if this
527  // gives a result. We consider white space to be a possible indication of
528  // the start or end of the phone number.
529  scoped_ptr<RegExpInput> candidate_input(
530      reg_exps_->regexp_factory_->CreateInput(candidate));
531  if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
532                                                  NULL)) {
533    // Try the first group by itself.
534    int group_start_index =
535        candidate.length() - candidate_input->ToString().length();
536    string first_group_only = candidate.substr(0, group_start_index);
537    phone_util_.TrimUnwantedEndChars(&first_group_only);
538    bool success = ParseAndVerify(first_group_only, offset, match);
539    if (success) {
540      return true;
541    }
542    --max_tries_;
543
544    // Try the rest of the candidate without the first group.
545    string without_first_group(candidate_input->ToString());
546    phone_util_.TrimUnwantedEndChars(&without_first_group);
547    success =
548        ParseAndVerify(without_first_group, offset + group_start_index, match);
549    if (success) {
550      return true;
551    }
552    --max_tries_;
553
554    if (max_tries_ > 0) {
555      while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
556                                                         NULL)) {
557        // Find the last group.
558      }
559      int last_group_start =
560          candidate.length() - candidate_input->ToString().length();
561      string without_last_group = candidate.substr(0, last_group_start);
562      phone_util_.TrimUnwantedEndChars(&without_last_group);
563      if (without_last_group == first_group_only) {
564        // If there are only two groups, then the group "without the last group"
565        // is the same as the first group. In these cases, we don't want to
566        // re-check the number group, so we exit already.
567        return false;
568      }
569      success = ParseAndVerify(without_last_group, offset, match);
570      if (success) {
571        return true;
572      }
573      --max_tries_;
574    }
575  }
576  return false;
577}
578
579bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
580                                      PhoneNumberMatch* match) {
581  DCHECK(match);
582  // Skip a match that is more likely a publication page reference or a date.
583  if (reg_exps_->pub_pages_->PartialMatch(candidate) ||
584      reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
585    return false;
586  }
587  // Skip potential time-stamps.
588  if (reg_exps_->time_stamps_->PartialMatch(candidate)) {
589    scoped_ptr<RegExpInput> following_text(
590        reg_exps_->regexp_factory_->CreateInput(
591            text_.substr(offset + candidate.size())));
592    if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) {
593      return false;
594    }
595  }
596
597  // Try to come up with a valid match given the entire candidate.
598  if (ParseAndVerify(candidate, offset, match)) {
599    return true;
600  }
601
602  // If that failed, try to find an "inner match" - there might be a phone
603  // number within this candidate.
604  return ExtractInnerMatch(candidate, offset, match);
605}
606
607bool PhoneNumberMatcher::HasNext() {
608  if (state_ == NOT_READY) {
609    PhoneNumberMatch temp_match;
610    if (!Find(search_index_, &temp_match)) {
611      state_ = DONE;
612    } else {
613      last_match_.reset(new PhoneNumberMatch(temp_match.start(),
614                                             temp_match.raw_string(),
615                                             temp_match.number()));
616      search_index_ = last_match_->end();
617      state_ = READY;
618    }
619  }
620  return state_ == READY;
621}
622
623bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
624  DCHECK(match);
625  // Check the state and find the next match as a side-effect if necessary.
626  if (!HasNext()) {
627    return false;
628  }
629  match->CopyFrom(*last_match_);
630  state_ = NOT_READY;
631  last_match_.reset(NULL);
632  return true;
633}
634
635bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
636  DCHECK(match);
637
638  scoped_ptr<RegExpInput> text(
639      reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
640  string candidate;
641  while ((max_tries_ > 0) &&
642         reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
643    int start = text_.length() - text->ToString().length() - candidate.length();
644    // Check for extra numbers at the end.
645    reg_exps_->capture_up_to_second_number_start_pattern_->
646        PartialMatch(candidate, &candidate);
647    if (ExtractMatch(candidate, start, match)) {
648      return true;
649    }
650
651    index = start + candidate.length();
652    --max_tries_;
653  }
654  return false;
655}
656
657bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
658    const PhoneNumber& phone_number,
659    const string& candidate,
660    ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
661                    const string&, const vector<string>&>* checker) const {
662  DCHECK(checker);
663  // TODO: Evaluate how this works for other locales (testing has been limited
664  // to NANPA regions) and optimise if necessary.
665  string normalized_candidate =
666      NormalizeUTF8::NormalizeDecimalDigits(candidate);
667  vector<string> formatted_number_groups;
668  GetNationalNumberGroups(phone_number, NULL,  // Use default formatting pattern
669                          &formatted_number_groups);
670  if (checker->Run(phone_util_, phone_number, normalized_candidate,
671                   formatted_number_groups)) {
672    return true;
673  }
674  // If this didn't pass, see if there are any alternate formats, and try them
675  // instead.
676  const PhoneMetadata* alternate_formats =
677    alternate_formats_->GetAlternateFormatsForCountry(
678        phone_number.country_code());
679  if (alternate_formats) {
680    for (RepeatedPtrField<NumberFormat>::const_iterator it =
681             alternate_formats->number_format().begin();
682         it != alternate_formats->number_format().end(); ++it) {
683      formatted_number_groups.clear();
684      GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
685      if (checker->Run(phone_util_, phone_number, normalized_candidate,
686                       formatted_number_groups)) {
687        return true;
688      }
689    }
690  }
691  return false;
692}
693
694// Helper method to get the national-number part of a number, formatted without
695// any national prefix, and return it as a set of digit blocks that would be
696// formatted together.
697void PhoneNumberMatcher::GetNationalNumberGroups(
698    const PhoneNumber& number,
699    const NumberFormat* formatting_pattern,
700    vector<string>* digit_blocks) const {
701  string rfc3966_format;
702  if (!formatting_pattern) {
703    // This will be in the format +CC-DG;ext=EXT where DG represents groups of
704    // digits.
705    phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
706    // We remove the extension part from the formatted string before splitting
707    // it into different groups.
708    size_t end_index = rfc3966_format.find(';');
709    if (end_index == string::npos) {
710      end_index = rfc3966_format.length();
711    }
712    // The country-code will have a '-' following it.
713    size_t start_index = rfc3966_format.find('-') + 1;
714    SplitStringUsing(rfc3966_format.substr(start_index,
715                                           end_index - start_index),
716                     "-", digit_blocks);
717  } else {
718    // We format the NSN only, and split that according to the separator.
719    string national_significant_number;
720    phone_util_.GetNationalSignificantNumber(number,
721                                             &national_significant_number);
722    phone_util_.FormatNsnUsingPattern(national_significant_number,
723                                      *formatting_pattern,
724                                      PhoneNumberUtil::RFC3966,
725                                      &rfc3966_format);
726    SplitStringUsing(rfc3966_format, "-", digit_blocks);
727  }
728}
729
730bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
731    const PhoneNumber& number) const {
732  // First, check how we deduced the country code. If it was written in
733  // international format, then the national prefix is not required.
734  if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
735    return true;
736  }
737  string phone_number_region;
738  phone_util_.GetRegionCodeForCountryCode(
739      number.country_code(), &phone_number_region);
740  const PhoneMetadata* metadata =
741      phone_util_.GetMetadataForRegion(phone_number_region);
742  if (!metadata) {
743    return true;
744  }
745  // Check if a national prefix should be present when formatting this number.
746  string national_number;
747  phone_util_.GetNationalSignificantNumber(number, &national_number);
748  const NumberFormat* format_rule =
749      phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(),
750                                                   national_number);
751  // To do this, we check that a national prefix formatting rule was present and
752  // that it wasn't just the first-group symbol ($1) with punctuation.
753  if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) {
754    if (format_rule->national_prefix_optional_when_formatting()) {
755      // The national-prefix is optional in these cases, so we don't need to
756      // check if it was present.
757      return true;
758    }
759    if (phone_util_.FormattingRuleHasFirstGroupOnly(
760        format_rule->national_prefix_formatting_rule())) {
761      // National Prefix not needed for this number.
762      return true;
763    }
764    // Normalize the remainder.
765    string raw_input_copy(number.raw_input());
766    // Check if we found a national prefix and/or carrier code at the start of
767    // the raw input, and return the result.
768    phone_util_.NormalizeDigitsOnly(&raw_input_copy);
769    return phone_util_.MaybeStripNationalPrefixAndCarrierCode(
770        *metadata,
771        &raw_input_copy,
772        NULL);  // Don't need to keep the stripped carrier code.
773  }
774  return true;
775}
776
777bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
778    const PhoneNumberUtil& util,
779    const PhoneNumber& phone_number,
780    const string& normalized_candidate,
781    const vector<string>& formatted_number_groups) const {
782    const scoped_ptr<RegExpInput> candidate_number(
783        reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
784  vector<string> candidate_groups;
785  string digit_block;
786  while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
787             candidate_number.get(),
788             &digit_block)) {
789    candidate_groups.push_back(digit_block);
790  }
791
792  // Set this to the last group, skipping it if the number has an extension.
793  int candidate_number_group_index =
794      phone_number.has_extension() ? candidate_groups.size() - 2
795                                   : candidate_groups.size() - 1;
796  // First we check if the national significant number is formatted as a block.
797  // We use find and not equals, since the national significant number may be
798  // present with a prefix such as a national number prefix, or the country code
799  // itself.
800  string national_significant_number;
801  util.GetNationalSignificantNumber(phone_number,
802                                    &national_significant_number);
803  if (candidate_groups.size() == 1 ||
804      candidate_groups.at(candidate_number_group_index).find(
805          national_significant_number) != string::npos) {
806    return true;
807  }
808  // Starting from the end, go through in reverse, excluding the first group,
809  // and check the candidate and number groups are the same.
810  for (int formatted_number_group_index =
811           (formatted_number_groups.size() - 1);
812       formatted_number_group_index > 0 &&
813       candidate_number_group_index >= 0;
814       --formatted_number_group_index, --candidate_number_group_index) {
815    if (candidate_groups.at(candidate_number_group_index) !=
816        formatted_number_groups.at(formatted_number_group_index)) {
817      return false;
818    }
819  }
820  // Now check the first group. There may be a national prefix at the start, so
821  // we only check that the candidate group ends with the formatted number
822  // group.
823  return (candidate_number_group_index >= 0 &&
824          HasSuffixString(candidate_groups.at(candidate_number_group_index),
825                          formatted_number_groups.at(0)));
826}
827
828// static
829bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber(
830    const PhoneNumber& number,
831    const string& candidate,
832    const PhoneNumberUtil& util) {
833  size_t first_slash_in_body = candidate.find('/');
834  if (first_slash_in_body == string::npos) {
835    // No slashes, this is okay.
836    return false;
837  }
838  // Now look for a second one.
839  size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1);
840  if (second_slash_in_body == string::npos) {
841    // Only one slash, this is okay.
842    return false;
843  }
844
845  // If the first slash is after the country calling code, this is permitted.
846  if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN ||
847      number.country_code_source() ==
848          PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) {
849    string normalized_country_code =
850        candidate.substr(0, first_slash_in_body);
851    util.NormalizeDigitsOnly(&normalized_country_code);
852    if (normalized_country_code == SimpleItoa(number.country_code())) {
853      // Any more slashes and this is illegal.
854      return candidate.find('/', second_slash_in_body + 1) != string::npos;
855    }
856  }
857  return true;
858}
859
860}  // namespace phonenumbers
861}  // namespace i18n
862