121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka/* 221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * Copyright (C) 2017 The Android Open Source Project 321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * 421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * Licensed under the Apache License, Version 2.0 (the "License"); 521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * you may not use this file except in compliance with the License. 621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * You may obtain a copy of the License at 721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * 821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * http://www.apache.org/licenses/LICENSE-2.0 921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * 1021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * Unless required by applicable law or agreed to in writing, software 1121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * distributed under the License is distributed on an "AS IS" BASIS, 1221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * See the License for the specific language governing permissions and 1421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka * limitations under the License. 1521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka */ 1621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 1721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka#include "util/utf8/unilib-icu.h" 1821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 19b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka#include <utility> 2021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 2121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkanamespace libtextclassifier2 { 2221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 23b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkabool UniLib::ParseInt32(const UnicodeText& text, int* result) const { 24b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UErrorCode status = U_ZERO_ERROR; 25b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UNumberFormat* format_alias = 26b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka unum_open(UNUM_DECIMAL, nullptr, 0, "en_US_POSIX", nullptr, &status); 27b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(status)) { 28b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return false; 29b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 30b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka icu::UnicodeString utf8_string = icu::UnicodeString::fromUTF8( 31b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka icu::StringPiece(text.data(), text.size_bytes())); 32b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka int parse_index = 0; 33b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka const int32 integer = unum_parse(format_alias, utf8_string.getBuffer(), 34b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka utf8_string.length(), &parse_index, &status); 35b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *result = integer; 36b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka unum_close(format_alias); 37b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(status) || parse_index != utf8_string.length()) { 38b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return false; 39b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 40b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return true; 41b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 42b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 4321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsOpeningBracket(char32 codepoint) const { 4421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) == 4521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka U_BPT_OPEN; 4621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 4721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 4821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsClosingBracket(char32 codepoint) const { 4921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) == 5021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka U_BPT_CLOSE; 5121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 5221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 5321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsWhitespace(char32 codepoint) const { 5421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return u_isWhitespace(codepoint); 5521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 5621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 5721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsDigit(char32 codepoint) const { return u_isdigit(codepoint); } 5821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 5921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkabool UniLib::IsUpper(char32 codepoint) const { return u_isupper(codepoint); } 6021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 6121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkachar32 UniLib::ToLower(char32 codepoint) const { return u_tolower(codepoint); } 6221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 6321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkachar32 UniLib::GetPairedBracket(char32 codepoint) const { 6421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return u_getBidiPairedBracket(codepoint); 6521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 6621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 67b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas ZilkaUniLib::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern, 68b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka icu::UnicodeString text) 69e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka : text_(std::move(text)), 70e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka last_find_offset_(0), 71e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka last_find_offset_codepoints_(0), 72e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka last_find_offset_dirty_(true) { 73b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UErrorCode status = U_ZERO_ERROR; 74b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka matcher_.reset(pattern->matcher(text_, status)); 75b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(status)) { 76b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka matcher_.reset(nullptr); 77b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 78b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 79b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 80b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkastd::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher( 81b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka const UnicodeText& input) const { 82b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return std::unique_ptr<UniLib::RegexMatcher>(new UniLib::RegexMatcher( 83b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka pattern_.get(), icu::UnicodeString::fromUTF8( 84b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka icu::StringPiece(input.data(), input.size_bytes())))); 85b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 86b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 87b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaconstexpr int UniLib::RegexMatcher::kError; 88b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaconstexpr int UniLib::RegexMatcher::kNoError; 89b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 90b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkabool UniLib::RegexMatcher::Matches(int* status) const { 91b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (!matcher_) { 92b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 93b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return false; 94b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 95ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka 96b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UErrorCode icu_status = U_ZERO_ERROR; 97b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka const bool result = matcher_->matches(/*startIndex=*/0, icu_status); 98b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(icu_status)) { 99b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 10021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return false; 10121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka } 102b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kNoError; 103b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return result; 104b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 10521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 106ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkabool UniLib::RegexMatcher::ApproximatelyMatches(int* status) { 107b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (!matcher_) { 108b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 10921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return false; 11021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka } 111ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka 112ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka matcher_->reset(); 113ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka *status = kNoError; 114ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka if (!Find(status) || *status != kNoError) { 115b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return false; 116b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 117ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka const int found_start = Start(status); 118ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka if (*status != kNoError) { 119ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return false; 120ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka } 121ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka const int found_end = End(status); 122ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka if (*status != kNoError) { 123ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return false; 124ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka } 125ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka if (found_start != 0 || found_end != text_.countChar32()) { 126ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return false; 127ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka } 128ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return true; 129b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 130b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 131e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilkabool UniLib::RegexMatcher::UpdateLastFindOffset() const { 132e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka if (!last_find_offset_dirty_) { 133e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka return true; 134e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka } 135e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka 136e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka // Update the position of the match. 137e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka UErrorCode icu_status = U_ZERO_ERROR; 138e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka const int find_offset = matcher_->start(0, icu_status); 139e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka if (U_FAILURE(icu_status)) { 140e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka return false; 141e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka } 142e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka last_find_offset_codepoints_ += 143e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka text_.countChar32(last_find_offset_, find_offset - last_find_offset_); 144e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka last_find_offset_ = find_offset; 145e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka last_find_offset_dirty_ = false; 146e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka 147e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka return true; 148e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka} 149e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka 150ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkabool UniLib::RegexMatcher::Find(int* status) { 151b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (!matcher_) { 152b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 153ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return false; 154b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 155b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UErrorCode icu_status = U_ZERO_ERROR; 156ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka const bool result = matcher_->find(icu_status); 157b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(icu_status)) { 158b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 159ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return false; 160b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 161e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka 162e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka last_find_offset_dirty_ = true; 163b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kNoError; 164b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return result; 165b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 166b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 167ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkaint UniLib::RegexMatcher::Start(int* status) const { 168ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return Start(/*group_idx=*/0, status); 169ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka} 170ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka 171b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaint UniLib::RegexMatcher::Start(int group_idx, int* status) const { 172e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka if (!matcher_ || !UpdateLastFindOffset()) { 173b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 174b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return kError; 175b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 176e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka 177b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UErrorCode icu_status = U_ZERO_ERROR; 178b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka const int result = matcher_->start(group_idx, icu_status); 179b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(icu_status)) { 180b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 181b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return kError; 182b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 183b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kNoError; 18421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 185e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka // If the group didn't participate in the match the result is -1 and is 186e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka // incompatible with the caching logic bellow. 187e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka if (result == -1) { 188e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka return -1; 189b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 190e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka 191e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka return last_find_offset_codepoints_ + 192e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka text_.countChar32(/*start=*/last_find_offset_, 193e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka /*length=*/result - last_find_offset_); 194ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka} 195ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka 196ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilkaint UniLib::RegexMatcher::End(int* status) const { 197ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return End(/*group_idx=*/0, status); 19821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 19921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 200b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilkaint UniLib::RegexMatcher::End(int group_idx, int* status) const { 201e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka if (!matcher_ || !UpdateLastFindOffset()) { 202b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 203b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return kError; 204b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 205b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UErrorCode icu_status = U_ZERO_ERROR; 206b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka const int result = matcher_->end(group_idx, icu_status); 207b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(icu_status)) { 208b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 209b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return kError; 210b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 211b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kNoError; 212b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 213e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka // If the group didn't participate in the match the result is -1 and is 214e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka // incompatible with the caching logic bellow. 215e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka if (result == -1) { 216e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka return -1; 217b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 218e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka 219e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka return last_find_offset_codepoints_ + 220e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka text_.countChar32(/*start=*/last_find_offset_, 221e7962cca83035d93ca32912c47f46a1c5a4ef016Lukas Zilka /*length=*/result - last_find_offset_); 222ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka} 223ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka 224ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas ZilkaUnicodeText UniLib::RegexMatcher::Group(int* status) const { 225ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return Group(/*group_idx=*/0, status); 226b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 227b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 228b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas ZilkaUnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const { 229b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (!matcher_) { 230b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 231b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return UTF8ToUnicodeText("", /*do_copy=*/false); 232b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 233b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka std::string result = ""; 234b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka UErrorCode icu_status = U_ZERO_ERROR; 235ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka const icu::UnicodeString result_icu = matcher_->group(group_idx, icu_status); 236b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka if (U_FAILURE(icu_status)) { 237b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kError; 238b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return UTF8ToUnicodeText("", /*do_copy=*/false); 239b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka } 240ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka result_icu.toUTF8String(result); 241b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka *status = kNoError; 242b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka return UTF8ToUnicodeText(result, /*do_copy=*/true); 243b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka} 244b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka 24521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkaconstexpr int UniLib::BreakIterator::kDone; 24621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 247b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas ZilkaUniLib::BreakIterator::BreakIterator(const UnicodeText& text) 248b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka : text_(icu::UnicodeString::fromUTF8( 249ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka icu::StringPiece(text.data(), text.size_bytes()))), 250ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka last_break_index_(0), 251ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka last_unicode_index_(0) { 25221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka icu::ErrorCode status; 25321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka break_iterator_.reset( 25421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka icu::BreakIterator::createWordInstance(icu::Locale("en"), status)); 25521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka if (!status.isSuccess()) { 25621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka break_iterator_.reset(); 25721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return; 25821d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka } 259b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka break_iterator_->setText(text_); 26021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 26121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 26221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkaint UniLib::BreakIterator::Next() { 263ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka const int break_index = break_iterator_->next(); 264ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka if (break_index == icu::BreakIterator::DONE) { 26521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return BreakIterator::kDone; 26621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka } 267ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka last_unicode_index_ += 268ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka text_.countChar32(last_break_index_, break_index - last_break_index_); 269ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka last_break_index_ = break_index; 270ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40bLukas Zilka return last_unicode_index_; 27121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 27221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 27321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkastd::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern( 274b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka const UnicodeText& regex) const { 27521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka UErrorCode status = U_ZERO_ERROR; 276b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka std::unique_ptr<icu::RegexPattern> pattern( 277b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(icu::StringPiece( 278b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka regex.data(), regex.size_bytes())), 279b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka /*flags=*/UREGEX_MULTILINE, status)); 28021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka if (U_FAILURE(status) || !pattern) { 28121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return nullptr; 28221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka } 28321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return std::unique_ptr<UniLib::RegexPattern>( 28421d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka new UniLib::RegexPattern(std::move(pattern))); 28521d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 28621d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 28721d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilkastd::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator( 288b23e2125be90bbf6124e9cd5684fc93026c5ec4dLukas Zilka const UnicodeText& text) const { 28921d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka return std::unique_ptr<UniLib::BreakIterator>( 29021d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka new UniLib::BreakIterator(text)); 29121d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} 29221d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka 29321d8c98fb12bc83dd0e9f5cb8fa9197ef325e074Lukas Zilka} // namespace libtextclassifier2 294