// unicode.h revision 44f0eee88ff00398ff7f715fab053374d808c90d
// Copyright 2007-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef V8_UNICODE_H_
#define V8_UNICODE_H_

#include <sys/types.h>

/**
 * \file
 * Definitions and convenience functions for working with unicode.
 */

namespace unibrow {

// A single character value as passed through this API.
typedef unsigned int uchar;
// A single unit of encoded data.
typedef unsigned char byte;

/**
 * The max length of the result of converting the case of a single
 * character.
 */
static const int kMaxMappingSize = 4;

// Holds a per-instance table of |size| CacheEntry records, each pairing a
// 21-bit code point with a single-bit value.  get() and CalculateValue()
// are only declared here; their definitions live outside this header.
// NOTE(review): presumably T supplies the property test (e.g. a struct with
// a static Is(uchar)) and kMask is applied as an index mask, which would
// require |size| to be a power of two -- confirm against the definitions.
template <class T, int size = 256>
class Predicate {
 public:
  inline Predicate() { }
  inline bool get(uchar c);
 private:
  friend class Test;
  bool CalculateValue(uchar c);
  struct CacheEntry {
    // Default entries hold code point 0 with a false value.
    inline CacheEntry() : code_point_(0), value_(0) { }
    inline CacheEntry(uchar code_point, bool value)
      : code_point_(code_point),
        value_(value) { }
    // Bit-fields: 21 bits of code point plus one bit of value.
    uchar code_point_ : 21;
    bool value_ : 1;
  };
  static const int kSize = size;
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// A cache used in case conversion.  It caches the value for characters
// that either have no mapping or map to a single character independent
// of context.  Characters that map to more than one character or that
// map differently depending on context are always looked up.
template <class T, int size = 256>
class Mapping {
 public:
  inline Mapping() { }
  // Declared here, defined outside this header.
  // NOTE(review): presumably |n| is the character following |c| and the
  // return value is the number of uchars written to |result| -- confirm.
  inline int get(uchar c, uchar n, uchar* result);
 private:
  friend class Test;
  int CalculateValue(uchar c, uchar n, uchar* result);
  struct CacheEntry {
    // Default entries carry the kNoChar sentinel and a zero offset.
    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
    inline CacheEntry(uchar code_point, signed offset)
      : code_point_(code_point),
        offset_(offset) { }
    uchar code_point_;
    signed offset_;
    // Sentinel code point: (1 << 21) - 1, i.e. the low 21 bits all set.
    static const int kNoChar = (1 << 21) - 1;
  };
  static const int kSize = size;
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// Declares a byte-count query and a maximum code point.  Both members are
// private and defined outside this header; only the friend class Test can
// reach them.
class UnicodeData {
 private:
  friend class Test;
  static int GetByteCount();
  static const uchar kMaxCodePoint;
};

// --- U t f 8 ---

// Pairs a data handle of type Data with an unsigned length.
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // The default-constructed buffer has data 0 and length 0.
  inline Buffer() : data_(0), length_(0) { }
  // Accessors are const-qualified so they can be called on const buffers.
  Data data() const { return data_; }
  unsigned length() const { return length_; }
 private:
  Data data_;
  unsigned length_;
};

// Static UTF-8 helpers and limits.  Every function is declared here and
// defined outside this header.
// NOTE(review): presumably Length() yields the encoded size of |chr|,
// Encode() writes |c| to |out| and returns the number of bytes written, and
// kBadChar (U+FFFD) is what decoding yields for malformed input -- confirm
// against the definitions.
class Utf8 {
 public:
  static inline uchar Length(uchar chr);
  static inline unsigned Encode(char* out, uchar c);
  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
      unsigned capacity, unsigned* chars_read, unsigned* offset);
  static uchar CalculateValue(const byte* str,
                              unsigned length,
                              unsigned* cursor);
  static const uchar kBadChar = 0xFFFD;
  static const unsigned kMaxEncodedSize   = 4;
  // Largest values for the one- to four-byte ranges named by each constant.
  static const unsigned kMaxOneByteChar   = 0x7f;
  static const unsigned kMaxTwoByteChar   = 0x7ff;
  static const unsigned kMaxThreeByteChar = 0xffff;
  static const unsigned kMaxFourByteChar  = 0x1fffff;

 private:
  template <unsigned s> friend class Utf8InputBuffer;
  friend class Test;
  static inline uchar ValueOf(const byte* str,
                              unsigned length,
                              unsigned* cursor);
};

// --- C h a r a c t e r   S t r e a m ---

// Abstract character source: Rewind() and FillBuffer() are pure virtual and
// must be supplied by subclasses.  The remaining members are declared here
// and defined outside this header, except has_more().
class CharacterStream {
 public:
  inline uchar GetNext();
  // True while the current buffer still holds unread characters.
  inline bool has_more() const { return remaining_ != 0; }
  // Note that default implementation is not efficient.
  virtual void Seek(unsigned);
  unsigned Length();
  virtual ~CharacterStream() { }
  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
      unsigned& offset);
  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
  virtual void Rewind() = 0;
 protected:
  virtual void FillBuffer() = 0;
  // The number of characters left in the current buffer
  unsigned remaining_;
  // The current offset within the buffer
  unsigned cursor_;
  // The buffer containing the decoded characters.
  const byte* buffer_;
};

// --- I n p u t   B u f f e r ---

/**
 * Provides efficient access to encoded characters in strings.  It
 * does so by reading characters one block at a time, rather than one
 * character at a time, which gives string implementations an
 * opportunity to optimize the decoding.
 */
template <class Reader, class Input = Reader*, unsigned kSize = 256>
class InputBuffer : public CharacterStream {
 public:
  virtual void Rewind();
  inline void Reset(Input input);
  void Seek(unsigned position);
  inline void Reset(unsigned position, Input input);
 protected:
  // Construction is reserved for subclasses.  The default constructor
  // initializes no members; the one-argument form calls Reset(input).
  InputBuffer() { }
  explicit InputBuffer(Input input) { Reset(input); }
  virtual void FillBuffer();

  // A custom offset that can be used by the string implementation to
  // mark progress within the encoded string.
  unsigned offset_;
  // The input string
  Input input_;
  // To avoid heap allocation, we keep an internal buffer to which
  // the encoded string can write its characters.  The string
  // implementation is free to decide whether it wants to use this
  // buffer or not.
  byte util_buffer_[kSize];
};

// --- U t f 8   I n p u t   B u f f e r ---

// InputBuffer specialised for Utf8 as the reader and Buffer<const char*> as
// the input, with an internal buffer of |s| bytes.
template <unsigned s = 256>
class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
 public:
  inline Utf8InputBuffer() { }
  inline Utf8InputBuffer(const char* data, unsigned length);
  // Wraps (data, length) in a Buffer and forwards it to the base Reset().
  inline void Reset(const char* data, unsigned length) {
    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
        Buffer<const char*>(data, length));
  }
};


// Character classes.  Each struct exposes one static Is(uchar) test whose
// definition lives outside this header.
struct Uppercase {
  static bool Is(uchar c);
};
struct Lowercase {
  static bool Is(uchar c);
};
struct Letter {
  static bool Is(uchar c);
};
struct Space {
  static bool Is(uchar c);
};
struct Number {
  static bool Is(uchar c);
};
struct WhiteSpace {
  static bool Is(uchar c);
};
struct LineTerminator {
  static bool Is(uchar c);
};
struct CombiningMark {
  static bool Is(uchar c);
};
struct ConnectorPunctuation {
  static bool Is(uchar c);
};

// Character conversions.  Each struct exposes a kMaxWidth constant and one
// static Convert() whose definition lives outside this header.
// NOTE(review): presumably kMaxWidth bounds the number of uchars written to
// |result|, |n| is the character following |c|, and |allow_caching_ptr|
// reports whether the result may be cached -- confirm against the
// definitions.
struct ToLowercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct ToUppercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct Ecma262Canonicalize {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct Ecma262UnCanonicalize {
  static const int kMaxWidth = 4;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct CanonicalizationRange {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};

}  // namespace unibrow

#endif  // V8_UNICODE_H_