1// Copyright 2007-2008 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
30
31#include <sys/types.h>
32
33/**
34 * \file
 * Definitions and convenience functions for working with Unicode.
36 */
37
38namespace unibrow {
39
// Unsigned integer type used throughout this header for character values.
typedef unsigned uchar;
// Unsigned byte type used for the raw buffers declared in this header.
typedef unsigned char byte;
42
/**
 * Upper bound on the number of characters that converting the case of
 * one single character can produce.
 */
const int kMaxMappingSize = 4;
48
49template <class T, int size = 256>
50class Predicate {
51 public:
52  inline Predicate() { }
53  inline bool get(uchar c);
54 private:
55  friend class Test;
56  bool CalculateValue(uchar c);
57  struct CacheEntry {
58    inline CacheEntry() : code_point_(0), value_(0) { }
59    inline CacheEntry(uchar code_point, bool value)
60      : code_point_(code_point),
61        value_(value) { }
62    uchar code_point_ : 21;
63    bool value_ : 1;
64  };
65  static const int kSize = size;
66  static const int kMask = kSize - 1;
67  CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion.  It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context.  Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77  inline Mapping() { }
78  inline int get(uchar c, uchar n, uchar* result);
79 private:
80  friend class Test;
81  int CalculateValue(uchar c, uchar n, uchar* result);
82  struct CacheEntry {
83    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84    inline CacheEntry(uchar code_point, signed offset)
85      : code_point_(code_point),
86        offset_(offset) { }
87    uchar code_point_;
88    signed offset_;
89    static const int kNoChar = (1 << 21) - 1;
90  };
91  static const int kSize = size;
92  static const int kMask = kSize - 1;
93  CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98  friend class Test;
99  static int GetByteCount();
100  static uchar kMaxCodePoint;
101};
102
103// --- U t f   8 ---
104
// A small value type that pairs a data handle with a length.  It stores
// the two values it is given and nothing else.
template <typename Data>
class Buffer {
 public:
  // Stores the given data handle together with its length.
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // Creates an empty buffer: data is initialized from 0 and length is 0.
  inline Buffer() : data_(0), length_(0) { }
  // The accessors only read state, so they are const-qualified; this
  // makes them usable on const buffers and through const references.
  Data data() const { return data_; }
  unsigned length() const { return length_; }
 private:
  Data data_;
  unsigned length_;
};
116
117class Utf8 {
118 public:
119  static inline uchar Length(uchar chr);
120  static inline unsigned Encode(char* out, uchar c);
121  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
122      unsigned capacity, unsigned* chars_read, unsigned* offset);
123  static const uchar kBadChar = 0xFFFD;
124  static const unsigned kMaxEncodedSize   = 4;
125  static const unsigned kMaxOneByteChar   = 0x7f;
126  static const unsigned kMaxTwoByteChar   = 0x7ff;
127  static const unsigned kMaxThreeByteChar = 0xffff;
128  static const unsigned kMaxFourByteChar  = 0x1fffff;
129
130 private:
131  template <unsigned s> friend class Utf8InputBuffer;
132  friend class Test;
133  static inline uchar ValueOf(const byte* str,
134                              unsigned length,
135                              unsigned* cursor);
136  static uchar CalculateValue(const byte* str,
137                              unsigned length,
138                              unsigned* cursor);
139};
140
141// --- C h a r a c t e r   S t r e a m ---
142
143class CharacterStream {
144 public:
145  inline uchar GetNext();
146  inline bool has_more() { return remaining_ != 0; }
147  // Note that default implementation is not efficient.
148  virtual void Seek(unsigned);
149  unsigned Length();
150  virtual ~CharacterStream() { }
151  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
152      unsigned& offset);
153  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
154      unsigned capacity, unsigned& offset);
155  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
156      unsigned capacity, unsigned& offset);
157  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
158  virtual void Rewind() = 0;
159 protected:
160  virtual void FillBuffer() = 0;
161  // The number of characters left in the current buffer
162  unsigned remaining_;
163  // The current offset within the buffer
164  unsigned cursor_;
165  // The buffer containing the decoded characters.
166  const byte* buffer_;
167};
168
169// --- I n p u t   B u f f e r ---
170
171/**
172 * Provides efficient access to encoded characters in strings.  It
173 * does so by reading characters one block at a time, rather than one
174 * character at a time, which gives string implementations an
175 * opportunity to optimize the decoding.
176 */
177template <class Reader, class Input = Reader*, unsigned kSize = 256>
178class InputBuffer : public CharacterStream {
179 public:
180  virtual void Rewind();
181  inline void Reset(Input input);
182  void Seek(unsigned position);
183  inline void Reset(unsigned position, Input input);
184 protected:
185  InputBuffer() { }
186  explicit InputBuffer(Input input) { Reset(input); }
187  virtual void FillBuffer();
188
189  // A custom offset that can be used by the string implementation to
190  // mark progress within the encoded string.
191  unsigned offset_;
192  // The input string
193  Input input_;
194  // To avoid heap allocation, we keep an internal buffer to which
195  // the encoded string can write its characters.  The string
196  // implementation is free to decide whether it wants to use this
197  // buffer or not.
198  byte util_buffer_[kSize];
199};
200
201// --- U t f 8   I n p u t   B u f f e r ---
202
203template <unsigned s = 256>
204class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
205 public:
206  inline Utf8InputBuffer() { }
207  inline Utf8InputBuffer(const char* data, unsigned length);
208  inline void Reset(const char* data, unsigned length) {
209    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
210        Buffer<const char*>(data, length));
211  }
212};
213
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for uppercase characters - confirm in
// the definition.
struct Uppercase {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for lowercase characters - confirm in
// the definition.
struct Lowercase {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for letter characters - confirm in the
// definition.
struct Letter {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for space characters; how it differs
// from WhiteSpace cannot be told from here - confirm in the definition.
struct Space {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for number characters - confirm in the
// definition.
struct Number {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for white space characters; how it
// differs from Space cannot be told from here - confirm in the
// definition.
struct WhiteSpace {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for line terminator characters -
// confirm in the definition.
struct LineTerminator {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for combining mark characters - confirm
// in the definition.
struct CombiningMark {
  static bool Is(uchar c);
};
// Static character predicate; Is() is only declared in this header.
// NOTE(review): presumably true for connector punctuation characters -
// confirm in the definition.
struct ConnectorPunctuation {
  static bool Is(uchar c);
};
// Character conversion with a declared maximum width of 3.  Convert()
// is only declared in this header.
// NOTE(review): presumably writes the lowercase mapping of c to result
// and returns the number of characters written, with n being the
// following character and *allow_caching_ptr telling the caller whether
// the result may be cached - confirm in the definition.
struct ToLowercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
// Character conversion with a declared maximum width of 3.  Convert()
// is only declared in this header.
// NOTE(review): presumably writes the uppercase mapping of c to result
// and returns the number of characters written, with n being the
// following character and *allow_caching_ptr telling the caller whether
// the result may be cached - confirm in the definition.
struct ToUppercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
// Character conversion with a declared maximum width of 1.  Convert()
// is only declared in this header.
// NOTE(review): by its name this presumably implements the ECMA-262
// regular expression Canonicalize operation - confirm in the
// definition.
struct Ecma262Canonicalize {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
// Character conversion with a declared maximum width of 4.  Convert()
// is only declared in this header.
// NOTE(review): by its name this presumably yields the characters that
// canonicalize to the same character as c - confirm in the definition.
struct Ecma262UnCanonicalize {
  static const int kMaxWidth = 4;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
// Character conversion with a declared maximum width of 1.  Convert()
// is only declared in this header.
// NOTE(review): the meaning of the converted value cannot be told from
// this header; the name suggests it relates to ranges of characters
// that canonicalize alike - confirm in the definition.
struct CanonicalizationRange {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
276
277}  // namespace unibrow
278
279#endif  // V8_UNICODE_H_
280