1/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "util/utf8/unilib-icu.h"
18
19#include <utility>
20
21namespace libtextclassifier2 {
22
23bool UniLib::ParseInt32(const UnicodeText& text, int* result) const {
24  UErrorCode status = U_ZERO_ERROR;
25  UNumberFormat* format_alias =
26      unum_open(UNUM_DECIMAL, nullptr, 0, "en_US_POSIX", nullptr, &status);
27  if (U_FAILURE(status)) {
28    return false;
29  }
30  icu::UnicodeString utf8_string = icu::UnicodeString::fromUTF8(
31      icu::StringPiece(text.data(), text.size_bytes()));
32  int parse_index = 0;
33  const int32 integer = unum_parse(format_alias, utf8_string.getBuffer(),
34                                   utf8_string.length(), &parse_index, &status);
35  *result = integer;
36  unum_close(format_alias);
37  if (U_FAILURE(status) || parse_index != utf8_string.length()) {
38    return false;
39  }
40  return true;
41}
42
43bool UniLib::IsOpeningBracket(char32 codepoint) const {
44  return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
45         U_BPT_OPEN;
46}
47
48bool UniLib::IsClosingBracket(char32 codepoint) const {
49  return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
50         U_BPT_CLOSE;
51}
52
53bool UniLib::IsWhitespace(char32 codepoint) const {
54  return u_isWhitespace(codepoint);
55}
56
57bool UniLib::IsDigit(char32 codepoint) const { return u_isdigit(codepoint); }
58
59bool UniLib::IsUpper(char32 codepoint) const { return u_isupper(codepoint); }
60
61char32 UniLib::ToLower(char32 codepoint) const { return u_tolower(codepoint); }
62
63char32 UniLib::GetPairedBracket(char32 codepoint) const {
64  return u_getBidiPairedBracket(codepoint);
65}
66
67UniLib::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern,
68                                   icu::UnicodeString text)
69    : text_(std::move(text)),
70      last_find_offset_(0),
71      last_find_offset_codepoints_(0),
72      last_find_offset_dirty_(true) {
73  UErrorCode status = U_ZERO_ERROR;
74  matcher_.reset(pattern->matcher(text_, status));
75  if (U_FAILURE(status)) {
76    matcher_.reset(nullptr);
77  }
78}
79
80std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher(
81    const UnicodeText& input) const {
82  return std::unique_ptr<UniLib::RegexMatcher>(new UniLib::RegexMatcher(
83      pattern_.get(), icu::UnicodeString::fromUTF8(
84                          icu::StringPiece(input.data(), input.size_bytes()))));
85}
86
// Out-of-class definitions of the RegexMatcher status constants. They provide
// storage for the constants so that they can be odr-used (for example bound to
// a const reference) when building with language standards older than C++17.
constexpr int UniLib::RegexMatcher::kError;
constexpr int UniLib::RegexMatcher::kNoError;
89
90bool UniLib::RegexMatcher::Matches(int* status) const {
91  if (!matcher_) {
92    *status = kError;
93    return false;
94  }
95
96  UErrorCode icu_status = U_ZERO_ERROR;
97  const bool result = matcher_->matches(/*startIndex=*/0, icu_status);
98  if (U_FAILURE(icu_status)) {
99    *status = kError;
100    return false;
101  }
102  *status = kNoError;
103  return result;
104}
105
106bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) {
107  if (!matcher_) {
108    *status = kError;
109    return false;
110  }
111
112  matcher_->reset();
113  *status = kNoError;
114  if (!Find(status) || *status != kNoError) {
115    return false;
116  }
117  const int found_start = Start(status);
118  if (*status != kNoError) {
119    return false;
120  }
121  const int found_end = End(status);
122  if (*status != kNoError) {
123    return false;
124  }
125  if (found_start != 0 || found_end != text_.countChar32()) {
126    return false;
127  }
128  return true;
129}
130
131bool UniLib::RegexMatcher::UpdateLastFindOffset() const {
132  if (!last_find_offset_dirty_) {
133    return true;
134  }
135
136  // Update the position of the match.
137  UErrorCode icu_status = U_ZERO_ERROR;
138  const int find_offset = matcher_->start(0, icu_status);
139  if (U_FAILURE(icu_status)) {
140    return false;
141  }
142  last_find_offset_codepoints_ +=
143      text_.countChar32(last_find_offset_, find_offset - last_find_offset_);
144  last_find_offset_ = find_offset;
145  last_find_offset_dirty_ = false;
146
147  return true;
148}
149
150bool UniLib::RegexMatcher::Find(int* status) {
151  if (!matcher_) {
152    *status = kError;
153    return false;
154  }
155  UErrorCode icu_status = U_ZERO_ERROR;
156  const bool result = matcher_->find(icu_status);
157  if (U_FAILURE(icu_status)) {
158    *status = kError;
159    return false;
160  }
161
162  last_find_offset_dirty_ = true;
163  *status = kNoError;
164  return result;
165}
166
167int UniLib::RegexMatcher::Start(int* status) const {
168  return Start(/*group_idx=*/0, status);
169}
170
171int UniLib::RegexMatcher::Start(int group_idx, int* status) const {
172  if (!matcher_ || !UpdateLastFindOffset()) {
173    *status = kError;
174    return kError;
175  }
176
177  UErrorCode icu_status = U_ZERO_ERROR;
178  const int result = matcher_->start(group_idx, icu_status);
179  if (U_FAILURE(icu_status)) {
180    *status = kError;
181    return kError;
182  }
183  *status = kNoError;
184
185  // If the group didn't participate in the match the result is -1 and is
186  // incompatible with the caching logic bellow.
187  if (result == -1) {
188    return -1;
189  }
190
191  return last_find_offset_codepoints_ +
192         text_.countChar32(/*start=*/last_find_offset_,
193                           /*length=*/result - last_find_offset_);
194}
195
196int UniLib::RegexMatcher::End(int* status) const {
197  return End(/*group_idx=*/0, status);
198}
199
200int UniLib::RegexMatcher::End(int group_idx, int* status) const {
201  if (!matcher_ || !UpdateLastFindOffset()) {
202    *status = kError;
203    return kError;
204  }
205  UErrorCode icu_status = U_ZERO_ERROR;
206  const int result = matcher_->end(group_idx, icu_status);
207  if (U_FAILURE(icu_status)) {
208    *status = kError;
209    return kError;
210  }
211  *status = kNoError;
212
213  // If the group didn't participate in the match the result is -1 and is
214  // incompatible with the caching logic bellow.
215  if (result == -1) {
216    return -1;
217  }
218
219  return last_find_offset_codepoints_ +
220         text_.countChar32(/*start=*/last_find_offset_,
221                           /*length=*/result - last_find_offset_);
222}
223
224UnicodeText UniLib::RegexMatcher::Group(int* status) const {
225  return Group(/*group_idx=*/0, status);
226}
227
228UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const {
229  if (!matcher_) {
230    *status = kError;
231    return UTF8ToUnicodeText("", /*do_copy=*/false);
232  }
233  std::string result = "";
234  UErrorCode icu_status = U_ZERO_ERROR;
235  const icu::UnicodeString result_icu = matcher_->group(group_idx, icu_status);
236  if (U_FAILURE(icu_status)) {
237    *status = kError;
238    return UTF8ToUnicodeText("", /*do_copy=*/false);
239  }
240  result_icu.toUTF8String(result);
241  *status = kNoError;
242  return UTF8ToUnicodeText(result, /*do_copy=*/true);
243}
244
// Out-of-class definition of the BreakIterator end marker. It provides storage
// for the constant so that it can be odr-used when building with language
// standards older than C++17.
constexpr int UniLib::BreakIterator::kDone;
246
247UniLib::BreakIterator::BreakIterator(const UnicodeText& text)
248    : text_(icu::UnicodeString::fromUTF8(
249          icu::StringPiece(text.data(), text.size_bytes()))),
250      last_break_index_(0),
251      last_unicode_index_(0) {
252  icu::ErrorCode status;
253  break_iterator_.reset(
254      icu::BreakIterator::createWordInstance(icu::Locale("en"), status));
255  if (!status.isSuccess()) {
256    break_iterator_.reset();
257    return;
258  }
259  break_iterator_->setText(text_);
260}
261
262int UniLib::BreakIterator::Next() {
263  const int break_index = break_iterator_->next();
264  if (break_index == icu::BreakIterator::DONE) {
265    return BreakIterator::kDone;
266  }
267  last_unicode_index_ +=
268      text_.countChar32(last_break_index_, break_index - last_break_index_);
269  last_break_index_ = break_index;
270  return last_unicode_index_;
271}
272
273std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern(
274    const UnicodeText& regex) const {
275  UErrorCode status = U_ZERO_ERROR;
276  std::unique_ptr<icu::RegexPattern> pattern(
277      icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(icu::StringPiece(
278                                     regex.data(), regex.size_bytes())),
279                                 /*flags=*/UREGEX_MULTILINE, status));
280  if (U_FAILURE(status) || !pattern) {
281    return nullptr;
282  }
283  return std::unique_ptr<UniLib::RegexPattern>(
284      new UniLib::RegexPattern(std::move(pattern)));
285}
286
287std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator(
288    const UnicodeText& text) const {
289  return std::unique_ptr<UniLib::BreakIterator>(
290      new UniLib::BreakIterator(text));
291}
292
293}  // namespace libtextclassifier2
294