1/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17// UniLib implementation with the help of ICU. UniLib is basically a wrapper
18// around the ICU functionality.
19
20#ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_
21#define LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_
22
23#include <memory>
24
25#include "util/base/integral_types.h"
26#include "util/utf8/unicodetext.h"
27#include "unicode/brkiter.h"
28#include "unicode/errorcode.h"
29#include "unicode/regex.h"
30#include "unicode/uchar.h"
31#include "unicode/unum.h"
32
33namespace libtextclassifier2 {
34
35class UniLib {
36 public:
37  bool ParseInt32(const UnicodeText& text, int* result) const;
38  bool IsOpeningBracket(char32 codepoint) const;
39  bool IsClosingBracket(char32 codepoint) const;
40  bool IsWhitespace(char32 codepoint) const;
41  bool IsDigit(char32 codepoint) const;
42  bool IsUpper(char32 codepoint) const;
43
44  char32 ToLower(char32 codepoint) const;
45  char32 GetPairedBracket(char32 codepoint) const;
46
47  // Forward declaration for friend.
48  class RegexPattern;
49
50  class RegexMatcher {
51   public:
52    static constexpr int kError = -1;
53    static constexpr int kNoError = 0;
54
55    // Checks whether the input text matches the pattern exactly.
56    bool Matches(int* status) const;
57
58    // Approximate Matches() implementation implemented using Find(). It uses
59    // the first Find() result and then checks that it spans the whole input.
60    // NOTE: Unlike Matches() it can result in false negatives.
61    // NOTE: Resets the matcher, so the current Find() state will be lost.
62    bool ApproximatelyMatches(int* status);
63
64    // Finds occurrences of the pattern in the input text.
65    // Can be called repeatedly to find all occurences. A call will update
66    // internal state, so that 'Start', 'End' and 'Group' can be called to get
67    // information about the match.
68    // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
69    // modify the state.
70    bool Find(int* status);
71
72    // Gets the start offset of the last match (from  'Find').
73    // Sets status to 'kError' if 'Find'
74    // was not called previously.
75    int Start(int* status) const;
76
77    // Gets the start offset of the specified group of the last match.
78    // (from  'Find').
79    // Sets status to 'kError' if an invalid group was specified or if 'Find'
80    // was not called previously.
81    int Start(int group_idx, int* status) const;
82
83    // Gets the end offset of the last match (from  'Find').
84    // Sets status to 'kError' if 'Find'
85    // was not called previously.
86    int End(int* status) const;
87
88    // Gets the end offset of the specified group of the last match.
89    // (from  'Find').
90    // Sets status to 'kError' if an invalid group was specified or if 'Find'
91    // was not called previously.
92    int End(int group_idx, int* status) const;
93
94    // Gets the text of the last match (from 'Find').
95    // Sets status to 'kError' if 'Find' was not called previously.
96    UnicodeText Group(int* status) const;
97
98    // Gets the text of the specified group of the last match (from 'Find').
99    // Sets status to 'kError' if an invalid group was specified or if 'Find'
100    // was not called previously.
101    UnicodeText Group(int group_idx, int* status) const;
102
103   protected:
104    friend class RegexPattern;
105    explicit RegexMatcher(icu::RegexPattern* pattern, icu::UnicodeString text);
106
107   private:
108    bool UpdateLastFindOffset() const;
109
110    std::unique_ptr<icu::RegexMatcher> matcher_;
111    icu::UnicodeString text_;
112    mutable int last_find_offset_;
113    mutable int last_find_offset_codepoints_;
114    mutable bool last_find_offset_dirty_;
115  };
116
117  class RegexPattern {
118   public:
119    std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& input) const;
120
121   protected:
122    friend class UniLib;
123    explicit RegexPattern(std::unique_ptr<icu::RegexPattern> pattern)
124        : pattern_(std::move(pattern)) {}
125
126   private:
127    std::unique_ptr<icu::RegexPattern> pattern_;
128  };
129
130  class BreakIterator {
131   public:
132    int Next();
133
134    static constexpr int kDone = -1;
135
136   protected:
137    friend class UniLib;
138    explicit BreakIterator(const UnicodeText& text);
139
140   private:
141    std::unique_ptr<icu::BreakIterator> break_iterator_;
142    icu::UnicodeString text_;
143    int last_break_index_;
144    int last_unicode_index_;
145  };
146
147  std::unique_ptr<RegexPattern> CreateRegexPattern(
148      const UnicodeText& regex) const;
149  std::unique_ptr<BreakIterator> CreateBreakIterator(
150      const UnicodeText& text) const;
151};
152
153}  // namespace libtextclassifier2
154
155#endif  // LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_
156