1/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
18#define LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
19
20#include <utility>
21
22#include "base.h"
23
24namespace libtextclassifier {
25
26// ***************************** UnicodeText **************************
27//
28// A UnicodeText object is a wrapper around a sequence of Unicode
29// codepoint values that allows iteration over these values.
30//
31// The internal representation of the text is UTF-8. Since UTF-8 is a
32// variable-width format, UnicodeText does not provide random access
33// to the text, and changes to the text are permitted only at the end.
34//
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (char32). The iterator is a
// read-only iterator. It becomes invalid if the text is changed.
38//
39// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
40// 0x10FFFF], but UnicodeText has the additional restriction that it
41// can contain only those characters that are valid for interchange on
42// the Web. This excludes all of the control codes except for carriage
43// return, line feed, and horizontal tab.  It also excludes
44// non-characters, but codepoints that are in the Private Use regions
45// are allowed, as are codepoints that are unassigned. (See the
46// Unicode reference for details.)
47//
48// MEMORY MANAGEMENT:
49//
50// PointToUTF8(buffer, size) creates an alias pointing to buffer.
51//
52// The purpose of an alias is to avoid making an unnecessary copy of a
53// UTF-8 buffer while still providing access to the Unicode values
54// within that text through iterators. The lifetime of an alias must not
55// exceed the lifetime of the buffer from which it was constructed.
56//
57// Aliases should be used with care. If the source from which an alias
58// was created is freed, or if the contents are changed, while the
59// alias is still in use, fatal errors could result. But it can be
60// quite useful to have a UnicodeText "window" through which to see a
61// UTF-8 buffer without having to pay the price of making a copy.
62
63class UnicodeText {
64 public:
65  class const_iterator;
66
67  UnicodeText();  // Create an empty text.
68  UnicodeText(const UnicodeText& src);
69  ~UnicodeText();
70
71  class const_iterator {
72    typedef const_iterator CI;
73
74   public:
75    typedef std::input_iterator_tag iterator_category;
76    typedef char32 value_type;
77    typedef int difference_type;
78    typedef void pointer;  // (Not needed.)
79    typedef const char32 reference;  // (Needed for const_reverse_iterator)
80
81    // Iterators are default-constructible.
82    const_iterator();
83
84    // It's safe to make multiple passes over a UnicodeText.
85    const_iterator& operator=(const const_iterator& other);
86
87    char32 operator*() const;  // Dereference
88
89    const_iterator& operator++();  // Advance (++iter)
90    const_iterator operator++(int) {  // (iter++)
91      const_iterator result(*this);
92      ++*this;
93      return result;
94    }
95
96    const_iterator& operator--();     // Retreat (--iter)
97    const_iterator operator--(int) {  // (iter--)
98      const_iterator result(*this);
99      --*this;
100      return result;
101    }
102
103    friend bool operator==(const CI& lhs, const CI& rhs) {
104      return lhs.it_ == rhs.it_;
105    }
106    friend bool operator!=(const CI& lhs, const CI& rhs) {
107      return !(lhs == rhs);
108    }
109    friend bool operator<(const CI& lhs, const CI& rhs);
110    friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
111    friend bool operator<=(const CI& lhs, const CI& rhs) {
112      return !(rhs < lhs);
113    }
114    friend bool operator>=(const CI& lhs, const CI& rhs) {
115      return !(lhs < rhs);
116    }
117
118    int utf8_length() const {
119      if (it_[0] < 0x80) {
120        return 1;
121      } else if (it_[0] < 0xE0) {
122        return 2;
123      } else if (it_[0] < 0xF0) {
124        return 3;
125      } else {
126        return 4;
127      }
128    }
129    const char* utf8_data() const { return it_; }
130
131   private:
132    friend class UnicodeText;
133    explicit const_iterator(const char *it) : it_(it) {}
134
135    const char *it_;
136  };
137
138  const_iterator begin() const;
139  const_iterator end() const;
140
141  // x.PointToUTF8(buf,len) changes x so that it points to buf
142  // ("becomes an alias"). It does not take ownership or copy buf.
143  // This function assumes that the input is interchange valid UTF8.
144  UnicodeText& Copy(const UnicodeText& src);
145  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
146  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
147
148  // Calling this may invalidate pointers to underlying data.
149  UnicodeText& AppendUTF8(const char* utf8, int len);
150  void clear();
151
152  static std::string UTF8Substring(const const_iterator& first,
153                                   const const_iterator& last);
154
155 private:
156  friend class const_iterator;
157
158  class Repr {  // A byte-string.
159   public:
160    char* data_;
161    int size_;
162    int capacity_;
163    bool ours_;  // Do we own data_?
164
165    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
166    ~Repr() {
167      if (ours_) delete[] data_;
168    }
169
170    void clear();
171    void reserve(int capacity);
172    void resize(int size);
173
174    void append(const char* bytes, int byte_length);
175    void Copy(const char* data, int size);
176    void TakeOwnershipOf(char* data, int size, int capacity);
177    void PointTo(const char* data, int size);
178
179   private:
180    Repr& operator=(const Repr&);
181    Repr(const Repr& other);
182  };
183
184  Repr repr_;
185};
186
187typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>
188    UnicodeTextRange;
189
190UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy);
191UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy);
192
193}  // namespace libtextclassifier
194
195#endif  // LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
196