1/* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17// UniLib implementation with the help of ICU. UniLib is basically a wrapper 18// around the ICU functionality. 19 20#ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_ 21#define LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_ 22 23#include <memory> 24 25#include "util/base/integral_types.h" 26#include "util/utf8/unicodetext.h" 27#include "unicode/brkiter.h" 28#include "unicode/errorcode.h" 29#include "unicode/regex.h" 30#include "unicode/uchar.h" 31#include "unicode/unum.h" 32 33namespace libtextclassifier2 { 34 35class UniLib { 36 public: 37 bool ParseInt32(const UnicodeText& text, int* result) const; 38 bool IsOpeningBracket(char32 codepoint) const; 39 bool IsClosingBracket(char32 codepoint) const; 40 bool IsWhitespace(char32 codepoint) const; 41 bool IsDigit(char32 codepoint) const; 42 bool IsUpper(char32 codepoint) const; 43 44 char32 ToLower(char32 codepoint) const; 45 char32 GetPairedBracket(char32 codepoint) const; 46 47 // Forward declaration for friend. 48 class RegexPattern; 49 50 class RegexMatcher { 51 public: 52 static constexpr int kError = -1; 53 static constexpr int kNoError = 0; 54 55 // Checks whether the input text matches the pattern exactly. 56 bool Matches(int* status) const; 57 58 // Approximate Matches() implementation implemented using Find(). It uses 59 // the first Find() result and then checks that it spans the whole input. 60 // NOTE: Unlike Matches() it can result in false negatives. 61 // NOTE: Resets the matcher, so the current Find() state will be lost. 62 bool ApproximatelyMatches(int* status); 63 64 // Finds occurrences of the pattern in the input text. 65 // Can be called repeatedly to find all occurences. A call will update 66 // internal state, so that 'Start', 'End' and 'Group' can be called to get 67 // information about the match. 68 // NOTE: Any call to ApproximatelyMatches() in between Find() calls will 69 // modify the state. 70 bool Find(int* status); 71 72 // Gets the start offset of the last match (from 'Find'). 73 // Sets status to 'kError' if 'Find' 74 // was not called previously. 75 int Start(int* status) const; 76 77 // Gets the start offset of the specified group of the last match. 78 // (from 'Find'). 79 // Sets status to 'kError' if an invalid group was specified or if 'Find' 80 // was not called previously. 81 int Start(int group_idx, int* status) const; 82 83 // Gets the end offset of the last match (from 'Find'). 84 // Sets status to 'kError' if 'Find' 85 // was not called previously. 86 int End(int* status) const; 87 88 // Gets the end offset of the specified group of the last match. 89 // (from 'Find'). 90 // Sets status to 'kError' if an invalid group was specified or if 'Find' 91 // was not called previously. 92 int End(int group_idx, int* status) const; 93 94 // Gets the text of the last match (from 'Find'). 95 // Sets status to 'kError' if 'Find' was not called previously. 96 UnicodeText Group(int* status) const; 97 98 // Gets the text of the specified group of the last match (from 'Find'). 99 // Sets status to 'kError' if an invalid group was specified or if 'Find' 100 // was not called previously. 101 UnicodeText Group(int group_idx, int* status) const; 102 103 protected: 104 friend class RegexPattern; 105 explicit RegexMatcher(icu::RegexPattern* pattern, icu::UnicodeString text); 106 107 private: 108 bool UpdateLastFindOffset() const; 109 110 std::unique_ptr<icu::RegexMatcher> matcher_; 111 icu::UnicodeString text_; 112 mutable int last_find_offset_; 113 mutable int last_find_offset_codepoints_; 114 mutable bool last_find_offset_dirty_; 115 }; 116 117 class RegexPattern { 118 public: 119 std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& input) const; 120 121 protected: 122 friend class UniLib; 123 explicit RegexPattern(std::unique_ptr<icu::RegexPattern> pattern) 124 : pattern_(std::move(pattern)) {} 125 126 private: 127 std::unique_ptr<icu::RegexPattern> pattern_; 128 }; 129 130 class BreakIterator { 131 public: 132 int Next(); 133 134 static constexpr int kDone = -1; 135 136 protected: 137 friend class UniLib; 138 explicit BreakIterator(const UnicodeText& text); 139 140 private: 141 std::unique_ptr<icu::BreakIterator> break_iterator_; 142 icu::UnicodeString text_; 143 int last_break_index_; 144 int last_unicode_index_; 145 }; 146 147 std::unique_ptr<RegexPattern> CreateRegexPattern( 148 const UnicodeText& regex) const; 149 std::unique_ptr<BreakIterator> CreateBreakIterator( 150 const UnicodeText& text) const; 151}; 152 153} // namespace libtextclassifier2 154 155#endif // LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_ 156