// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
30
31#include <sys/types.h>
32#include <globals.h>
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
37
38namespace unibrow {
39
// A single character value.  Predicate::CacheEntry below stores it in a
// 21-bit field, so values are presumably Unicode code points (at most
// 0x1fffff) -- NOTE(review): confirm against the implementation.
typedef unsigned int uchar;
// A raw octet; the UTF-8 decoding routines take their input as byte arrays.
typedef unsigned char byte;
42
/**
 * The maximum number of characters that converting the case of a single
 * character can produce.
 */
const int kMaxMappingSize = 4;
48
49template <class T, int size = 256>
50class Predicate {
51 public:
52  inline Predicate() { }
53  inline bool get(uchar c);
54 private:
55  friend class Test;
56  bool CalculateValue(uchar c);
57  struct CacheEntry {
58    inline CacheEntry() : code_point_(0), value_(0) { }
59    inline CacheEntry(uchar code_point, bool value)
60      : code_point_(code_point),
61        value_(value) { }
62    uchar code_point_ : 21;
63    bool value_ : 1;
64  };
65  static const int kSize = size;
66  static const int kMask = kSize - 1;
67  CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion.  It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context.  Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77  inline Mapping() { }
78  inline int get(uchar c, uchar n, uchar* result);
79 private:
80  friend class Test;
81  int CalculateValue(uchar c, uchar n, uchar* result);
82  struct CacheEntry {
83    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84    inline CacheEntry(uchar code_point, signed offset)
85      : code_point_(code_point),
86        offset_(offset) { }
87    uchar code_point_;
88    signed offset_;
89    static const int kNoChar = (1 << 21) - 1;
90  };
91  static const int kSize = size;
92  static const int kMask = kSize - 1;
93  CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98  friend class Test;
99  static int GetByteCount();
100  static const uchar kMaxCodePoint;
101};
102
103class Utf16 {
104 public:
105  static inline bool IsLeadSurrogate(int code) {
106    if (code == kNoPreviousCharacter) return false;
107    return (code & 0xfc00) == 0xd800;
108  }
109  static inline bool IsTrailSurrogate(int code) {
110    if (code == kNoPreviousCharacter) return false;
111    return (code & 0xfc00) == 0xdc00;
112  }
113
114  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
115    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
116  }
117  static const int kNoPreviousCharacter = -1;
118  static const uchar kMaxNonSurrogateCharCode = 0xffff;
119  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
120  // of UTF-8 data.  The special case where the unit is a surrogate
121  // trail produces 1 byte net, because the encoding of the pair is
122  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
123  // can be reclaimed.
124  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
125  // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
126  // The illegality stems from the surrogate not being part of a pair.
127  static const int kUtf8BytesToCodeASurrogate = 3;
128  static inline uint16_t LeadSurrogate(uint32_t char_code) {
129    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
130  }
131  static inline uint16_t TrailSurrogate(uint32_t char_code) {
132    return 0xdc00 + (char_code & 0x3ff);
133  }
134};
135
// Constants and helpers for the Latin-1 (single byte) character range.
class Latin1 {
 public:
  // Largest character code representable in Latin-1.
  static const unsigned kMaxChar = 0xff;
  // Returns 0 if the character does not convert to a single Latin-1
  // character, or if the character does not convert back to Latin-1 via
  // the inverse operation (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
144
145class Utf8 {
146 public:
147  static inline uchar Length(uchar chr, int previous);
148  static inline unsigned EncodeOneByte(char* out, uint8_t c);
149  static inline unsigned Encode(
150      char* out, uchar c, int previous);
151  static uchar CalculateValue(const byte* str,
152                              unsigned length,
153                              unsigned* cursor);
154  static const uchar kBadChar = 0xFFFD;
155  static const unsigned kMaxEncodedSize   = 4;
156  static const unsigned kMaxOneByteChar   = 0x7f;
157  static const unsigned kMaxTwoByteChar   = 0x7ff;
158  static const unsigned kMaxThreeByteChar = 0xffff;
159  static const unsigned kMaxFourByteChar  = 0x1fffff;
160
161  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
162  // that match are coded as a 4 byte UTF-8 sequence.
163  static const unsigned kBytesSavedByCombiningSurrogates = 2;
164  static const unsigned kSizeOfUnmatchedSurrogate = 3;
165  static inline uchar ValueOf(const byte* str,
166                              unsigned length,
167                              unsigned* cursor);
168};
169
170
171class Utf8DecoderBase {
172 public:
173  // Initialization done in subclass.
174  inline Utf8DecoderBase();
175  inline Utf8DecoderBase(uint16_t* buffer,
176                         unsigned buffer_length,
177                         const uint8_t* stream,
178                         unsigned stream_length);
179  inline unsigned Utf16Length() const { return utf16_length_; }
180 protected:
181  // This reads all characters and sets the utf16_length_.
182  // The first buffer_length utf16 chars are cached in the buffer.
183  void Reset(uint16_t* buffer,
184             unsigned buffer_length,
185             const uint8_t* stream,
186             unsigned stream_length);
187  static void WriteUtf16Slow(const uint8_t* stream,
188                             uint16_t* data,
189                             unsigned length);
190  const uint8_t* unbuffered_start_;
191  unsigned utf16_length_;
192  bool last_byte_of_buffer_unused_;
193 private:
194  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
195};
196
197template <unsigned kBufferSize>
198class Utf8Decoder : public Utf8DecoderBase {
199 public:
200  inline Utf8Decoder() {}
201  inline Utf8Decoder(const char* stream, unsigned length);
202  inline void Reset(const char* stream, unsigned length);
203  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
204 private:
205  uint16_t buffer_[kBufferSize];
206};
207
208
209struct Uppercase {
210  static bool Is(uchar c);
211};
212struct Lowercase {
213  static bool Is(uchar c);
214};
215struct Letter {
216  static bool Is(uchar c);
217};
218struct Space {
219  static bool Is(uchar c);
220};
221struct Number {
222  static bool Is(uchar c);
223};
224struct WhiteSpace {
225  static bool Is(uchar c);
226};
227struct LineTerminator {
228  static bool Is(uchar c);
229};
230struct CombiningMark {
231  static bool Is(uchar c);
232};
233struct ConnectorPunctuation {
234  static bool Is(uchar c);
235};
236struct ToLowercase {
237  static const int kMaxWidth = 3;
238  static int Convert(uchar c,
239                     uchar n,
240                     uchar* result,
241                     bool* allow_caching_ptr);
242};
243struct ToUppercase {
244  static const int kMaxWidth = 3;
245  static int Convert(uchar c,
246                     uchar n,
247                     uchar* result,
248                     bool* allow_caching_ptr);
249};
250struct Ecma262Canonicalize {
251  static const int kMaxWidth = 1;
252  static int Convert(uchar c,
253                     uchar n,
254                     uchar* result,
255                     bool* allow_caching_ptr);
256};
257struct Ecma262UnCanonicalize {
258  static const int kMaxWidth = 4;
259  static int Convert(uchar c,
260                     uchar n,
261                     uchar* result,
262                     bool* allow_caching_ptr);
263};
264struct CanonicalizationRange {
265  static const int kMaxWidth = 1;
266  static int Convert(uchar c,
267                     uchar n,
268                     uchar* result,
269                     bool* allow_caching_ptr);
270};
271
272}  // namespace unibrow
273
274#endif  // V8_UNICODE_H_
275