1// Copyright (C) 2011 The Libphonenumber Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Author: George Yakovlev
16//         Philippe Liard
17
18// Note that we don't use features of ICU that depend on std::string (e.g.
19// UnicodeString::toUTF8String()) to support clients that build ICU without
20// -DU_HAVE_STD_STRING.
21
22#include "phonenumbers/regexp_adapter_icu.h"
23
24#include <stddef.h>
25#include <string>
26
27#include <unicode/regex.h>
28#include <unicode/stringpiece.h>
29#include <unicode/unistr.h>
30
31#include "phonenumbers/base/basictypes.h"
32#include "phonenumbers/base/logging.h"
33#include "phonenumbers/base/memory/scoped_ptr.h"
34#include "phonenumbers/default_logger.h"
35#include "phonenumbers/string_byte_sink.h"
36
37namespace i18n {
38namespace phonenumbers {
39
40using icu::RegexMatcher;
41using icu::RegexPattern;
42using icu::UnicodeString;
43
44namespace {
45
46// Converts UnicodeString 'source' to a UTF8-formatted std::string.
47string UnicodeStringToUtf8String(const UnicodeString& source) {
48  string data;
49  StringByteSink sink(&data);
50  source.toUTF8(sink);
51  return data;
52}
53
54// Converts UTF8-formatted std::string 'source' to a UnicodeString.
55UnicodeString Utf8StringToUnicodeString(const string& source) {
56  // Note that we don't use icu::StringPiece(const string&).
57  return UnicodeString::fromUTF8(
58      icu::StringPiece(source.c_str(), source.size()));
59}
60
61}  // namespace
62
63// Implementation of the abstract classes RegExpInput and RegExp using ICU
64// regular expression capabilities.
65
66// ICU implementation of the RegExpInput abstract class.
67class IcuRegExpInput : public RegExpInput {
68 public:
69  explicit IcuRegExpInput(const string& utf8_input)
70      : utf8_input_(Utf8StringToUnicodeString(utf8_input)),
71        position_(0) {}
72
73  virtual ~IcuRegExpInput() {}
74
75  virtual string ToString() const {
76    return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_));
77  }
78
79  UnicodeString* Data() {
80    return &utf8_input_;
81  }
82
83  // The current start position. For a newly created input, position is 0. Each
84  // call to ConsumeRegExp() or RegExp::Consume() advances the position in the
85  // case of the successful match to be after the match.
86  int position() const {
87    return position_;
88  }
89
90  void set_position(int position) {
91    DCHECK(position >= 0 && position <= utf8_input_.length());
92    position_ = position;
93  }
94
95 private:
96  UnicodeString utf8_input_;
97  int position_;
98
99  DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
100};
101
102// ICU implementation of the RegExp abstract class.
103class IcuRegExp : public RegExp {
104 public:
105  explicit IcuRegExp(const string& utf8_regexp) {
106    UParseError parse_error;
107    UErrorCode status = U_ZERO_ERROR;
108    utf8_regexp_.reset(RegexPattern::compile(
109        Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));
110    if (U_FAILURE(status)) {
111      // The provided regular expressions should compile correctly.
112      LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;
113      utf8_regexp_.reset(NULL);
114    }
115  }
116
117  virtual ~IcuRegExp() {}
118
119  virtual bool Consume(RegExpInput* input_string,
120                       bool anchor_at_start,
121                       string* matched_string1,
122                       string* matched_string2,
123                       string* matched_string3) const {
124    DCHECK(input_string);
125    if (!utf8_regexp_.get()) {
126      return false;
127    }
128    IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
129    UErrorCode status = U_ZERO_ERROR;
130    const scoped_ptr<RegexMatcher> matcher(
131        utf8_regexp_->matcher(*input->Data(), status));
132    bool match_succeeded = anchor_at_start
133        ? matcher->lookingAt(input->position(), status)
134        : matcher->find(input->position(), status);
135    if (!match_succeeded || U_FAILURE(status)) {
136      return false;
137    }
138    string* const matched_strings[] = {
139      matched_string1, matched_string2, matched_string3
140    };
141    // If less matches than expected - fail.
142    for (size_t i = 0; i < arraysize(matched_strings); ++i) {
143      if (matched_strings[i]) {
144        // Groups are counted from 1 rather than 0.
145        const int group_index = i + 1;
146        if (group_index > matcher->groupCount()) {
147          return false;
148        }
149        *matched_strings[i] =
150            UnicodeStringToUtf8String(matcher->group(group_index, status));
151      }
152    }
153    input->set_position(matcher->end(status));
154    return !U_FAILURE(status);
155  }
156
157  bool Match(const string& input_string,
158             bool full_match,
159             string* matched_string) const {
160    if (!utf8_regexp_.get()) {
161      return false;
162    }
163    IcuRegExpInput input(input_string);
164    UErrorCode status = U_ZERO_ERROR;
165    const scoped_ptr<RegexMatcher> matcher(
166        utf8_regexp_->matcher(*input.Data(), status));
167    bool match_succeeded = full_match
168        ? matcher->matches(input.position(), status)
169        : matcher->find(input.position(), status);
170    if (!match_succeeded || U_FAILURE(status)) {
171      return false;
172    }
173    if (matcher->groupCount() > 0 && matched_string) {
174      *matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
175    }
176    return !U_FAILURE(status);
177  }
178
179  bool Replace(string* string_to_process,
180               bool global,
181               const string& replacement_string) const {
182    DCHECK(string_to_process);
183    if (!utf8_regexp_.get()) {
184      return false;
185    }
186    IcuRegExpInput input(*string_to_process);
187    UErrorCode status = U_ZERO_ERROR;
188    const scoped_ptr<RegexMatcher> matcher(
189        utf8_regexp_->matcher(*input.Data(), status));
190    if (U_FAILURE(status)) {
191      return false;
192    }
193
194    UnicodeString output;
195    // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is
196    // consistent with the RE2 reg-ex matcher.
197    if (!matcher->find()) {
198      return false;
199    }
200    matcher->appendReplacement(output,
201                               Utf8StringToUnicodeString(replacement_string),
202                               status);
203    if (global) {
204      // Continue and look for more matches.
205      while (matcher->find()) {
206        matcher->appendReplacement(
207            output,
208            Utf8StringToUnicodeString(replacement_string),
209            status);
210      }
211    }
212
213    matcher->appendTail(output);
214    if (U_FAILURE(status)) {
215      return false;
216    }
217    const string replaced_string = UnicodeStringToUtf8String(output);
218    *string_to_process = replaced_string;
219    return true;
220  }
221
222 private:
223  scoped_ptr<RegexPattern> utf8_regexp_;
224
225  DISALLOW_COPY_AND_ASSIGN(IcuRegExp);
226};
227
228RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
229  return new IcuRegExpInput(utf8_input);
230}
231
232RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
233  return new IcuRegExp(utf8_regexp);
234}
235
236}  // namespace phonenumbers
237}  // namespace i18n
238