// unicode.h revision 44f0eee88ff00398ff7f715fab053374d808c90d
// Copyright 2007-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_UNICODE_H_
29#define V8_UNICODE_H_
30
31#include <sys/types.h>
32
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
37
38namespace unibrow {
39
// Character value type used by the declarations in this header.
typedef unsigned int uchar;
// A single raw byte, as used for encoded character data and buffers.
typedef unsigned char byte;
42
/**
 * Upper bound on the number of characters produced when the case of a
 * single character is converted.
 */
static const int kMaxMappingSize = 4;
48
49template <class T, int size = 256>
50class Predicate {
51 public:
52  inline Predicate() { }
53  inline bool get(uchar c);
54 private:
55  friend class Test;
56  bool CalculateValue(uchar c);
57  struct CacheEntry {
58    inline CacheEntry() : code_point_(0), value_(0) { }
59    inline CacheEntry(uchar code_point, bool value)
60      : code_point_(code_point),
61        value_(value) { }
62    uchar code_point_ : 21;
63    bool value_ : 1;
64  };
65  static const int kSize = size;
66  static const int kMask = kSize - 1;
67  CacheEntry entries_[kSize];
68};
69
70// A cache used in case conversion.  It caches the value for characters
71// that either have no mapping or map to a single character independent
72// of context.  Characters that map to more than one character or that
73// map differently depending on context are always looked up.
74template <class T, int size = 256>
75class Mapping {
76 public:
77  inline Mapping() { }
78  inline int get(uchar c, uchar n, uchar* result);
79 private:
80  friend class Test;
81  int CalculateValue(uchar c, uchar n, uchar* result);
82  struct CacheEntry {
83    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84    inline CacheEntry(uchar code_point, signed offset)
85      : code_point_(code_point),
86        offset_(offset) { }
87    uchar code_point_;
88    signed offset_;
89    static const int kNoChar = (1 << 21) - 1;
90  };
91  static const int kSize = size;
92  static const int kMask = kSize - 1;
93  CacheEntry entries_[kSize];
94};
95
96class UnicodeData {
97 private:
98  friend class Test;
99  static int GetByteCount();
100  static const uchar kMaxCodePoint;
101};
102
// --- U t f   8 ---

// A lightweight (data, length) pair describing a run of input.  The
// buffer does not manage the lifetime of whatever Data refers to; it
// only stores the two values it was given.
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // Creates an empty buffer: zero data and zero length.
  inline Buffer() : data_(0), length_(0) { }
  // The accessors are const so that const Buffer objects (and const
  // references to buffers) can be inspected.
  Data data() const { return data_; }
  unsigned length() const { return length_; }
 private:
  Data data_;
  unsigned length_;
};
116
117class Utf8 {
118 public:
119  static inline uchar Length(uchar chr);
120  static inline unsigned Encode(char* out, uchar c);
121  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
122      unsigned capacity, unsigned* chars_read, unsigned* offset);
123  static uchar CalculateValue(const byte* str,
124                              unsigned length,
125                              unsigned* cursor);
126  static const uchar kBadChar = 0xFFFD;
127  static const unsigned kMaxEncodedSize   = 4;
128  static const unsigned kMaxOneByteChar   = 0x7f;
129  static const unsigned kMaxTwoByteChar   = 0x7ff;
130  static const unsigned kMaxThreeByteChar = 0xffff;
131  static const unsigned kMaxFourByteChar  = 0x1fffff;
132
133 private:
134  template <unsigned s> friend class Utf8InputBuffer;
135  friend class Test;
136  static inline uchar ValueOf(const byte* str,
137                              unsigned length,
138                              unsigned* cursor);
139};
140
141// --- C h a r a c t e r   S t r e a m ---
142
143class CharacterStream {
144 public:
145  inline uchar GetNext();
146  inline bool has_more() { return remaining_ != 0; }
147  // Note that default implementation is not efficient.
148  virtual void Seek(unsigned);
149  unsigned Length();
150  virtual ~CharacterStream() { }
151  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
152      unsigned& offset);
153  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
154      unsigned capacity, unsigned& offset);
155  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
156      unsigned capacity, unsigned& offset);
157  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
158  virtual void Rewind() = 0;
159 protected:
160  virtual void FillBuffer() = 0;
161  // The number of characters left in the current buffer
162  unsigned remaining_;
163  // The current offset within the buffer
164  unsigned cursor_;
165  // The buffer containing the decoded characters.
166  const byte* buffer_;
167};
168
169// --- I n p u t   B u f f e r ---
170
171/**
172 * Provides efficient access to encoded characters in strings.  It
173 * does so by reading characters one block at a time, rather than one
174 * character at a time, which gives string implementations an
175 * opportunity to optimize the decoding.
176 */
177template <class Reader, class Input = Reader*, unsigned kSize = 256>
178class InputBuffer : public CharacterStream {
179 public:
180  virtual void Rewind();
181  inline void Reset(Input input);
182  void Seek(unsigned position);
183  inline void Reset(unsigned position, Input input);
184 protected:
185  InputBuffer() { }
186  explicit InputBuffer(Input input) { Reset(input); }
187  virtual void FillBuffer();
188
189  // A custom offset that can be used by the string implementation to
190  // mark progress within the encoded string.
191  unsigned offset_;
192  // The input string
193  Input input_;
194  // To avoid heap allocation, we keep an internal buffer to which
195  // the encoded string can write its characters.  The string
196  // implementation is free to decide whether it wants to use this
197  // buffer or not.
198  byte util_buffer_[kSize];
199};
200
201// --- U t f 8   I n p u t   B u f f e r ---
202
203template <unsigned s = 256>
204class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
205 public:
206  inline Utf8InputBuffer() { }
207  inline Utf8InputBuffer(const char* data, unsigned length);
208  inline void Reset(const char* data, unsigned length) {
209    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
210        Buffer<const char*>(data, length));
211  }
212};
213
214
215struct Uppercase {
216  static bool Is(uchar c);
217};
218struct Lowercase {
219  static bool Is(uchar c);
220};
221struct Letter {
222  static bool Is(uchar c);
223};
224struct Space {
225  static bool Is(uchar c);
226};
227struct Number {
228  static bool Is(uchar c);
229};
230struct WhiteSpace {
231  static bool Is(uchar c);
232};
233struct LineTerminator {
234  static bool Is(uchar c);
235};
236struct CombiningMark {
237  static bool Is(uchar c);
238};
239struct ConnectorPunctuation {
240  static bool Is(uchar c);
241};
242struct ToLowercase {
243  static const int kMaxWidth = 3;
244  static int Convert(uchar c,
245                     uchar n,
246                     uchar* result,
247                     bool* allow_caching_ptr);
248};
249struct ToUppercase {
250  static const int kMaxWidth = 3;
251  static int Convert(uchar c,
252                     uchar n,
253                     uchar* result,
254                     bool* allow_caching_ptr);
255};
256struct Ecma262Canonicalize {
257  static const int kMaxWidth = 1;
258  static int Convert(uchar c,
259                     uchar n,
260                     uchar* result,
261                     bool* allow_caching_ptr);
262};
263struct Ecma262UnCanonicalize {
264  static const int kMaxWidth = 4;
265  static int Convert(uchar c,
266                     uchar n,
267                     uchar* result,
268                     bool* allow_caching_ptr);
269};
270struct CanonicalizationRange {
271  static const int kMaxWidth = 1;
272  static int Convert(uchar c,
273                     uchar n,
274                     uchar* result,
275                     bool* allow_caching_ptr);
276};
277
278}  // namespace unibrow
279
280#endif  // V8_UNICODE_H_
281