1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
30
31#include <sys/types.h>
32
33/**
34 * \file
 * Definitions and convenience functions for working with Unicode.
36 */
37
38namespace unibrow {
39
// Scalar types used throughout this header: `uchar` carries character codes
// (code points and UTF-16 code units), `byte` carries raw encoded data.
typedef unsigned uchar;
typedef unsigned char byte;

/**
 * Upper bound on the number of characters that converting the case of a
 * single character can produce.
 */
const int kMaxMappingSize = 4;
48
49template <class T, int size = 256>
50class Predicate {
51 public:
52  inline Predicate() { }
53  inline bool get(uchar c);
54 private:
55  friend class Test;
56  bool CalculateValue(uchar c);
57  struct CacheEntry {
58    inline CacheEntry() : code_point_(0), value_(0) { }
59    inline CacheEntry(uchar code_point, bool value)
60      : code_point_(code_point),
61        value_(value) { }
62    uchar code_point_ : 21;
63    bool value_ : 1;
64  };
65  static const int kSize = size;
66  static const int kMask = kSize - 1;
67  CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion.  It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context.  Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77  inline Mapping() { }
78  inline int get(uchar c, uchar n, uchar* result);
79 private:
80  friend class Test;
81  int CalculateValue(uchar c, uchar n, uchar* result);
82  struct CacheEntry {
83    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84    inline CacheEntry(uchar code_point, signed offset)
85      : code_point_(code_point),
86        offset_(offset) { }
87    uchar code_point_;
88    signed offset_;
89    static const int kNoChar = (1 << 21) - 1;
90  };
91  static const int kSize = size;
92  static const int kMask = kSize - 1;
93  CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98  friend class Test;
99  static int GetByteCount();
100  static const uchar kMaxCodePoint;
101};
102
103// --- U t f   8   a n d   16 ---
104
// Plain (data, length) pair.  Data is the handle type (for example
// const char*); it is stored and handed back by value, so Buffer adds no
// ownership semantics beyond those of Data itself.
template <typename Data>
class Buffer {
 public:
  // Wraps the given handle and length as-is; nothing is copied or validated.
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // Empty buffer: data initialized from 0 and a length of 0.
  inline Buffer() : data_(0), length_(0) { }
  // Accessors are const so they also work on const Buffers and const
  // references; they only read the stored members.
  Data data() const { return data_; }
  unsigned length() const { return length_; }
 private:
  Data data_;
  unsigned length_;
};
116
117
118class Utf16 {
119 public:
120  static inline bool IsLeadSurrogate(int code) {
121    if (code == kNoPreviousCharacter) return false;
122    return (code & 0xfc00) == 0xd800;
123  }
124  static inline bool IsTrailSurrogate(int code) {
125    if (code == kNoPreviousCharacter) return false;
126    return (code & 0xfc00) == 0xdc00;
127  }
128
129  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
130    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
131  }
132  static const int kNoPreviousCharacter = -1;
133  static const uchar kMaxNonSurrogateCharCode = 0xffff;
134  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
135  // of UTF-8 data.  The special case where the unit is a surrogate
136  // trail produces 1 byte net, because the encoding of the pair is
137  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
138  // can be reclaimed.
139  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
140  // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
141  // The illegality stems from the surrogate not being part of a pair.
142  static const int kUtf8BytesToCodeASurrogate = 3;
143  static inline uchar LeadSurrogate(int char_code) {
144    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
145  }
146  static inline uchar TrailSurrogate(int char_code) {
147    return 0xdc00 + (char_code & 0x3ff);
148  }
149};
150
151
152class Utf8 {
153 public:
154  static inline uchar Length(uchar chr, int previous);
155  static inline unsigned Encode(
156      char* out, uchar c, int previous);
157  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158      unsigned capacity, unsigned* chars_read, unsigned* offset);
159  static uchar CalculateValue(const byte* str,
160                              unsigned length,
161                              unsigned* cursor);
162  static const uchar kBadChar = 0xFFFD;
163  static const unsigned kMaxEncodedSize   = 4;
164  static const unsigned kMaxOneByteChar   = 0x7f;
165  static const unsigned kMaxTwoByteChar   = 0x7ff;
166  static const unsigned kMaxThreeByteChar = 0xffff;
167  static const unsigned kMaxFourByteChar  = 0x1fffff;
168
169  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
170  // that match are coded as a 4 byte UTF-8 sequence.
171  static const unsigned kBytesSavedByCombiningSurrogates = 2;
172  static const unsigned kSizeOfUnmatchedSurrogate = 3;
173
174 private:
175  template <unsigned s> friend class Utf8InputBuffer;
176  friend class Test;
177  static inline uchar ValueOf(const byte* str,
178                              unsigned length,
179                              unsigned* cursor);
180};
181
182// --- C h a r a c t e r   S t r e a m ---
183
184class CharacterStream {
185 public:
186  inline uchar GetNext();
187  inline bool has_more() { return remaining_ != 0; }
188  // Note that default implementation is not efficient.
189  virtual void Seek(unsigned);
190  unsigned Length();
191  unsigned Utf16Length();
192  virtual ~CharacterStream() { }
193  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
194      unsigned& offset);
195  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
196      unsigned capacity, unsigned& offset);
197  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
198      unsigned capacity, unsigned& offset);
199  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
200  virtual void Rewind() = 0;
201
202 protected:
203  virtual void FillBuffer() = 0;
204  // The number of characters left in the current buffer
205  unsigned remaining_;
206  // The current offset within the buffer
207  unsigned cursor_;
208  // The buffer containing the decoded characters.
209  const byte* buffer_;
210};
211
212// --- I n p u t   B u f f e r ---
213
214/**
215 * Provides efficient access to encoded characters in strings.  It
216 * does so by reading characters one block at a time, rather than one
217 * character at a time, which gives string implementations an
218 * opportunity to optimize the decoding.
219 */
220template <class Reader, class Input = Reader*, unsigned kSize = 256>
221class InputBuffer : public CharacterStream {
222 public:
223  virtual void Rewind();
224  inline void Reset(Input input);
225  void Seek(unsigned position);
226  inline void Reset(unsigned position, Input input);
227 protected:
228  InputBuffer() { }
229  explicit InputBuffer(Input input) { Reset(input); }
230  virtual void FillBuffer();
231
232  // A custom offset that can be used by the string implementation to
233  // mark progress within the encoded string.
234  unsigned offset_;
235  // The input string
236  Input input_;
237  // To avoid heap allocation, we keep an internal buffer to which
238  // the encoded string can write its characters.  The string
239  // implementation is free to decide whether it wants to use this
240  // buffer or not.
241  byte util_buffer_[kSize];
242};
243
244// --- U t f 8   I n p u t   B u f f e r ---
245
246template <unsigned s = 256>
247class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
248 public:
249  inline Utf8InputBuffer() { }
250  inline Utf8InputBuffer(const char* data, unsigned length);
251  inline void Reset(const char* data, unsigned length) {
252    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
253        Buffer<const char*>(data, length));
254  }
255};
256
257
258struct Uppercase {
259  static bool Is(uchar c);
260};
261struct Lowercase {
262  static bool Is(uchar c);
263};
264struct Letter {
265  static bool Is(uchar c);
266};
267struct Space {
268  static bool Is(uchar c);
269};
270struct Number {
271  static bool Is(uchar c);
272};
273struct WhiteSpace {
274  static bool Is(uchar c);
275};
276struct LineTerminator {
277  static bool Is(uchar c);
278};
279struct CombiningMark {
280  static bool Is(uchar c);
281};
282struct ConnectorPunctuation {
283  static bool Is(uchar c);
284};
285struct ToLowercase {
286  static const int kMaxWidth = 3;
287  static int Convert(uchar c,
288                     uchar n,
289                     uchar* result,
290                     bool* allow_caching_ptr);
291};
292struct ToUppercase {
293  static const int kMaxWidth = 3;
294  static int Convert(uchar c,
295                     uchar n,
296                     uchar* result,
297                     bool* allow_caching_ptr);
298};
299struct Ecma262Canonicalize {
300  static const int kMaxWidth = 1;
301  static int Convert(uchar c,
302                     uchar n,
303                     uchar* result,
304                     bool* allow_caching_ptr);
305};
306struct Ecma262UnCanonicalize {
307  static const int kMaxWidth = 4;
308  static int Convert(uchar c,
309                     uchar n,
310                     uchar* result,
311                     bool* allow_caching_ptr);
312};
313struct CanonicalizationRange {
314  static const int kMaxWidth = 1;
315  static int Convert(uchar c,
316                     uchar n,
317                     uchar* result,
318                     bool* allow_caching_ptr);
319};
320
321}  // namespace unibrow
322
323#endif  // V8_UNICODE_H_
324