/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
1721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka#include "util/utf8/unilib-icu.h"
1821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
19b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka#include <utility>
2021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
2121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkanamespace libtextclassifier2 {
2221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
23b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkabool UniLib::ParseInt32(const UnicodeText& text, int* result) const {
24b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UErrorCode status = U_ZERO_ERROR;
25b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UNumberFormat* format_alias =
26b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka      unum_open(UNUM_DECIMAL, nullptr, 0, "en_US_POSIX", nullptr, &status);
27b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(status)) {
28b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return false;
29b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
30b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  icu::UnicodeString utf8_string = icu::UnicodeString::fromUTF8(
31b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka      icu::StringPiece(text.data(), text.size_bytes()));
32b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  int parse_index = 0;
33b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  const int32 integer = unum_parse(format_alias, utf8_string.getBuffer(),
34b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka                                   utf8_string.length(), &parse_index, &status);
35b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  *result = integer;
36b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  unum_close(format_alias);
37b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(status) || parse_index != utf8_string.length()) {
38b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return false;
39b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
40b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  return true;
41b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
42b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
4321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsOpeningBracket(char32 codepoint) const {
4421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
4521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka         U_BPT_OPEN;
4621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
4721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
4821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsClosingBracket(char32 codepoint) const {
4921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
5021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka         U_BPT_CLOSE;
5121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
5221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
5321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsWhitespace(char32 codepoint) const {
5421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  return u_isWhitespace(codepoint);
5521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
5621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
5721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsDigit(char32 codepoint) const { return u_isdigit(codepoint); }
5821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
5921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsUpper(char32 codepoint) const { return u_isupper(codepoint); }
6021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
6121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkachar32 UniLib::ToLower(char32 codepoint) const { return u_tolower(codepoint); }
6221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
6321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkachar32 UniLib::GetPairedBracket(char32 codepoint) const {
6421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  return u_getBidiPairedBracket(codepoint);
6521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
6621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
67b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas ZilkaUniLib::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern,
68b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka                                   icu::UnicodeString text)
69e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka    : text_(std::move(text)),
70e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka      last_find_offset_(0),
71e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka      last_find_offset_codepoints_(0),
72e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka      last_find_offset_dirty_(true) {
73b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UErrorCode status = U_ZERO_ERROR;
74b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  matcher_.reset(pattern->matcher(text_, status));
75b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(status)) {
76b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    matcher_.reset(nullptr);
77b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
78b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
79b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
80b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkastd::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher(
81b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    const UnicodeText& input) const {
82b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  return std::unique_ptr<UniLib::RegexMatcher>(new UniLib::RegexMatcher(
83b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka      pattern_.get(), icu::UnicodeString::fromUTF8(
84b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka                          icu::StringPiece(input.data(), input.size_bytes()))));
85b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
86b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
87b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaconstexpr int UniLib::RegexMatcher::kError;
88b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaconstexpr int UniLib::RegexMatcher::kNoError;
89b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
90b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkabool UniLib::RegexMatcher::Matches(int* status) const {
91b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (!matcher_) {
92b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
93b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return false;
94b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
95ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka
96b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UErrorCode icu_status = U_ZERO_ERROR;
97b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  const bool result = matcher_->matches(/*startIndex=*/0, icu_status);
98b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(icu_status)) {
99b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
10021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka    return false;
10121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  }
102b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  *status = kNoError;
103b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  return result;
104b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
10521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
106ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkabool UniLib::RegexMatcher::ApproximatelyMatches(int* status) {
107b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (!matcher_) {
108b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
10921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka    return false;
11021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  }
111ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka
112ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  matcher_->reset();
113ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  *status = kNoError;
114ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  if (!Find(status) || *status != kNoError) {
115b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return false;
116b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
117ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  const int found_start = Start(status);
118ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  if (*status != kNoError) {
119ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka    return false;
120ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  }
121ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  const int found_end = End(status);
122ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  if (*status != kNoError) {
123ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka    return false;
124ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  }
125ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  if (found_start != 0 || found_end != text_.countChar32()) {
126ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka    return false;
127ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  }
128ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  return true;
129b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
130b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
131e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilkabool UniLib::RegexMatcher::UpdateLastFindOffset() const {
132e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  if (!last_find_offset_dirty_) {
133e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka    return true;
134e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  }
135e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka
136e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  // Update the position of the match.
137e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  UErrorCode icu_status = U_ZERO_ERROR;
138e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  const int find_offset = matcher_->start(0, icu_status);
139e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  if (U_FAILURE(icu_status)) {
140e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka    return false;
141e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  }
142e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  last_find_offset_codepoints_ +=
143e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka      text_.countChar32(last_find_offset_, find_offset - last_find_offset_);
144e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  last_find_offset_ = find_offset;
145e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  last_find_offset_dirty_ = false;
146e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka
147e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  return true;
148e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka}
149e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka
150ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkabool UniLib::RegexMatcher::Find(int* status) {
151b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (!matcher_) {
152b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
153ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka    return false;
154b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
155b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UErrorCode icu_status = U_ZERO_ERROR;
156ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  const bool result = matcher_->find(icu_status);
157b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(icu_status)) {
158b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
159ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka    return false;
160b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
161e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka
162e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  last_find_offset_dirty_ = true;
163b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  *status = kNoError;
164b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  return result;
165b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
166b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
167ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkaint UniLib::RegexMatcher::Start(int* status) const {
168ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  return Start(/*group_idx=*/0, status);
169ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka}
170ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka
171b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaint UniLib::RegexMatcher::Start(int group_idx, int* status) const {
172e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  if (!matcher_ || !UpdateLastFindOffset()) {
173b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
174b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return kError;
175b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
176e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka
177b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UErrorCode icu_status = U_ZERO_ERROR;
178b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  const int result = matcher_->start(group_idx, icu_status);
179b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(icu_status)) {
180b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
181b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return kError;
182b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
183b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  *status = kNoError;
18421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
185e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  // If the group didn't participate in the match the result is -1 and is
186e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  // incompatible with the caching logic bellow.
187e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  if (result == -1) {
188e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka    return -1;
189b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
190e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka
191e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  return last_find_offset_codepoints_ +
192e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka         text_.countChar32(/*start=*/last_find_offset_,
193e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka                           /*length=*/result - last_find_offset_);
194ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka}
195ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka
196ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkaint UniLib::RegexMatcher::End(int* status) const {
197ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  return End(/*group_idx=*/0, status);
19821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
19921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
200b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaint UniLib::RegexMatcher::End(int group_idx, int* status) const {
201e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  if (!matcher_ || !UpdateLastFindOffset()) {
202b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
203b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return kError;
204b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
205b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UErrorCode icu_status = U_ZERO_ERROR;
206b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  const int result = matcher_->end(group_idx, icu_status);
207b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(icu_status)) {
208b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
209b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return kError;
210b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
211b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  *status = kNoError;
212b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
213e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  // If the group didn't participate in the match the result is -1 and is
214e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  // incompatible with the caching logic bellow.
215e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  if (result == -1) {
216e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka    return -1;
217b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
218e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka
219e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka  return last_find_offset_codepoints_ +
220e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka         text_.countChar32(/*start=*/last_find_offset_,
221e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka                           /*length=*/result - last_find_offset_);
222ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka}
223ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka
224ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas ZilkaUnicodeText UniLib::RegexMatcher::Group(int* status) const {
225ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  return Group(/*group_idx=*/0, status);
226b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
227b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
228b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas ZilkaUnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const {
229b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (!matcher_) {
230b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
231b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return UTF8ToUnicodeText("", /*do_copy=*/false);
232b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
233b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  std::string result = "";
234b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  UErrorCode icu_status = U_ZERO_ERROR;
235ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  const icu::UnicodeString result_icu = matcher_->group(group_idx, icu_status);
236b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  if (U_FAILURE(icu_status)) {
237b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    *status = kError;
238b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    return UTF8ToUnicodeText("", /*do_copy=*/false);
239b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  }
240ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  result_icu.toUTF8String(result);
241b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  *status = kNoError;
242b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  return UTF8ToUnicodeText(result, /*do_copy=*/true);
243b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka}
244b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka
24521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkaconstexpr int UniLib::BreakIterator::kDone;
24621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
247b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas ZilkaUniLib::BreakIterator::BreakIterator(const UnicodeText& text)
248b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    : text_(icu::UnicodeString::fromUTF8(
249ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka          icu::StringPiece(text.data(), text.size_bytes()))),
250ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka      last_break_index_(0),
251ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka      last_unicode_index_(0) {
25221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  icu::ErrorCode status;
25321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  break_iterator_.reset(
25421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka      icu::BreakIterator::createWordInstance(icu::Locale("en"), status));
25521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  if (!status.isSuccess()) {
25621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka    break_iterator_.reset();
25721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka    return;
25821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  }
259b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  break_iterator_->setText(text_);
26021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
26121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
26221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkaint UniLib::BreakIterator::Next() {
263ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  const int break_index = break_iterator_->next();
264ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  if (break_index == icu::BreakIterator::DONE) {
26521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka    return BreakIterator::kDone;
26621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  }
267ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  last_unicode_index_ +=
268ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka      text_.countChar32(last_break_index_, break_index - last_break_index_);
269ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  last_break_index_ = break_index;
270ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka  return last_unicode_index_;
27121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
27221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
27321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkastd::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern(
274b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    const UnicodeText& regex) const {
27521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  UErrorCode status = U_ZERO_ERROR;
276b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka  std::unique_ptr<icu::RegexPattern> pattern(
277b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka      icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(icu::StringPiece(
278b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka                                     regex.data(), regex.size_bytes())),
279b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka                                 /*flags=*/UREGEX_MULTILINE, status));
28021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  if (U_FAILURE(status) || !pattern) {
28121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka    return nullptr;
28221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  }
28321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  return std::unique_ptr<UniLib::RegexPattern>(
28421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka      new UniLib::RegexPattern(std::move(pattern)));
28521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
28621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
28721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkastd::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator(
288b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka    const UnicodeText& text) const {
28921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka  return std::unique_ptr<UniLib::BreakIterator>(
29021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka      new UniLib::BreakIterator(text));
29121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}
29221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka
29321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka}  // namespace libtextclassifier2
294