// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#ifndef V8_UNICODE_H_
6#define V8_UNICODE_H_
7
8#include <sys/types.h>
9#include "src/globals.h"
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
14
15namespace unibrow {
16
// Character value type used throughout this header: it carries code points
// (e.g. the 21-bit cache keys in Predicate) as well as individual UTF-16
// code units (e.g. the arguments of Utf16::CombineSurrogatePair).
typedef unsigned int uchar;
// A single raw byte, e.g. one unit of the input handed to
// Utf8::CalculateValue.
typedef unsigned char byte;

/**
 * The max length of the result of converting the case of a single
 * character.
 */
const int kMaxMappingSize = 4;
25
26template <class T, int size = 256>
27class Predicate {
28 public:
29  inline Predicate() { }
30  inline bool get(uchar c);
31 private:
32  friend class Test;
33  bool CalculateValue(uchar c);
34  struct CacheEntry {
35    inline CacheEntry() : code_point_(0), value_(0) { }
36    inline CacheEntry(uchar code_point, bool value)
37      : code_point_(code_point),
38        value_(value) { }
39    uchar code_point_ : 21;
40    bool value_ : 1;
41  };
42  static const int kSize = size;
43  static const int kMask = kSize - 1;
44  CacheEntry entries_[kSize];
45};
46
47// A cache used in case conversion.  It caches the value for characters
48// that either have no mapping or map to a single character independent
49// of context.  Characters that map to more than one character or that
50// map differently depending on context are always looked up.
51template <class T, int size = 256>
52class Mapping {
53 public:
54  inline Mapping() { }
55  inline int get(uchar c, uchar n, uchar* result);
56 private:
57  friend class Test;
58  int CalculateValue(uchar c, uchar n, uchar* result);
59  struct CacheEntry {
60    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
61    inline CacheEntry(uchar code_point, signed offset)
62      : code_point_(code_point),
63        offset_(offset) { }
64    uchar code_point_;
65    signed offset_;
66    static const int kNoChar = (1 << 21) - 1;
67  };
68  static const int kSize = size;
69  static const int kMask = kSize - 1;
70  CacheEntry entries_[kSize];
71};
72
73class UnicodeData {
74 private:
75  friend class Test;
76  static int GetByteCount();
77  static const uchar kMaxCodePoint;
78};
79
80class Utf16 {
81 public:
82  static inline bool IsSurrogatePair(int lead, int trail) {
83    return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
84  }
85  static inline bool IsLeadSurrogate(int code) {
86    if (code == kNoPreviousCharacter) return false;
87    return (code & 0xfc00) == 0xd800;
88  }
89  static inline bool IsTrailSurrogate(int code) {
90    if (code == kNoPreviousCharacter) return false;
91    return (code & 0xfc00) == 0xdc00;
92  }
93
94  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
95    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
96  }
97  static const int kNoPreviousCharacter = -1;
98  static const uchar kMaxNonSurrogateCharCode = 0xffff;
99  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
100  // of UTF-8 data.  The special case where the unit is a surrogate
101  // trail produces 1 byte net, because the encoding of the pair is
102  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
103  // can be reclaimed.
104  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
105  // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
106  // The illegality stems from the surrogate not being part of a pair.
107  static const int kUtf8BytesToCodeASurrogate = 3;
108  static inline uint16_t LeadSurrogate(uint32_t char_code) {
109    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
110  }
111  static inline uint16_t TrailSurrogate(uint32_t char_code) {
112    return 0xdc00 + (char_code & 0x3ff);
113  }
114};
115
// Constants and helpers for the Latin-1 (one byte per character) range.
class Latin1 {
 public:
  // Largest character code in the Latin-1 range.
  static const unsigned kMaxChar = 0xff;
  // Returns 0 if character does not convert to single latin-1 character
  // or if the character does not convert back to latin-1 via inverse
  // operation (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
124
125class Utf8 {
126 public:
127  static inline uchar Length(uchar chr, int previous);
128  static inline unsigned EncodeOneByte(char* out, uint8_t c);
129  static inline unsigned Encode(char* out,
130                                uchar c,
131                                int previous,
132                                bool replace_invalid = false);
133  static uchar CalculateValue(const byte* str,
134                              unsigned length,
135                              unsigned* cursor);
136
137  // The unicode replacement character, used to signal invalid unicode
138  // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
139  static const uchar kBadChar = 0xFFFD;
140  static const unsigned kMaxEncodedSize   = 4;
141  static const unsigned kMaxOneByteChar   = 0x7f;
142  static const unsigned kMaxTwoByteChar   = 0x7ff;
143  static const unsigned kMaxThreeByteChar = 0xffff;
144  static const unsigned kMaxFourByteChar  = 0x1fffff;
145
146  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
147  // that match are coded as a 4 byte UTF-8 sequence.
148  static const unsigned kBytesSavedByCombiningSurrogates = 2;
149  static const unsigned kSizeOfUnmatchedSurrogate = 3;
150  // The maximum size a single UTF-16 code unit may take up when encoded as
151  // UTF-8.
152  static const unsigned kMax16BitCodeUnitSize  = 3;
153  static inline uchar ValueOf(const byte* str,
154                              unsigned length,
155                              unsigned* cursor);
156};
157
158
159class Utf8DecoderBase {
160 public:
161  // Initialization done in subclass.
162  inline Utf8DecoderBase();
163  inline Utf8DecoderBase(uint16_t* buffer,
164                         unsigned buffer_length,
165                         const uint8_t* stream,
166                         unsigned stream_length);
167  inline unsigned Utf16Length() const { return utf16_length_; }
168 protected:
169  // This reads all characters and sets the utf16_length_.
170  // The first buffer_length utf16 chars are cached in the buffer.
171  void Reset(uint16_t* buffer,
172             unsigned buffer_length,
173             const uint8_t* stream,
174             unsigned stream_length);
175  static void WriteUtf16Slow(const uint8_t* stream,
176                             uint16_t* data,
177                             unsigned length);
178  const uint8_t* unbuffered_start_;
179  unsigned utf16_length_;
180  bool last_byte_of_buffer_unused_;
181 private:
182  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
183};
184
185template <unsigned kBufferSize>
186class Utf8Decoder : public Utf8DecoderBase {
187 public:
188  inline Utf8Decoder() {}
189  inline Utf8Decoder(const char* stream, unsigned length);
190  inline void Reset(const char* stream, unsigned length);
191  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
192 private:
193  uint16_t buffer_[kBufferSize];
194};
195
196
197struct Uppercase {
198  static bool Is(uchar c);
199};
200struct Lowercase {
201  static bool Is(uchar c);
202};
203struct Letter {
204  static bool Is(uchar c);
205};
206struct Number {
207  static bool Is(uchar c);
208};
209struct WhiteSpace {
210  static bool Is(uchar c);
211};
212struct LineTerminator {
213  static bool Is(uchar c);
214};
215struct CombiningMark {
216  static bool Is(uchar c);
217};
218struct ConnectorPunctuation {
219  static bool Is(uchar c);
220};
221struct ToLowercase {
222  static const int kMaxWidth = 3;
223  static const bool kIsToLower = true;
224  static int Convert(uchar c,
225                     uchar n,
226                     uchar* result,
227                     bool* allow_caching_ptr);
228};
229struct ToUppercase {
230  static const int kMaxWidth = 3;
231  static const bool kIsToLower = false;
232  static int Convert(uchar c,
233                     uchar n,
234                     uchar* result,
235                     bool* allow_caching_ptr);
236};
237struct Ecma262Canonicalize {
238  static const int kMaxWidth = 1;
239  static int Convert(uchar c,
240                     uchar n,
241                     uchar* result,
242                     bool* allow_caching_ptr);
243};
244struct Ecma262UnCanonicalize {
245  static const int kMaxWidth = 4;
246  static int Convert(uchar c,
247                     uchar n,
248                     uchar* result,
249                     bool* allow_caching_ptr);
250};
251struct CanonicalizationRange {
252  static const int kMaxWidth = 1;
253  static int Convert(uchar c,
254                     uchar n,
255                     uchar* result,
256                     bool* allow_caching_ptr);
257};
258
259}  // namespace unibrow
260
261#endif  // V8_UNICODE_H_
262