// unicode-inl.h revision b8a8cc1952d61a2f3a2568848933943a543b5d3e
1e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen// Copyright 2007-2010 the V8 project authors. All rights reserved. 2e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen// Use of this source code is governed by a BSD-style license that can be 3e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen// found in the LICENSE file. 4e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen 5e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#ifndef V8_UNICODE_INL_H_ 6e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#define V8_UNICODE_INL_H_ 7e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen 8e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#include "src/unicode.h" 9e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#include "src/base/logging.h" 10e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#include "src/utils.h" 11e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen 12e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chennamespace unibrow { 13e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen 14e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chentemplate <class T, int s> bool Predicate<T, s>::get(uchar code_point) { 15e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen CacheEntry entry = entries_[code_point & kMask]; 16e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen if (entry.code_point_ == code_point) return entry.value_; 17e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park return CalculateValue(code_point); 18e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park} 19e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park 20e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chentemplate <class T, int s> bool Predicate<T, s>::CalculateValue( 21280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv uchar code_point) { 229aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv bool result = T::Is(code_point); 239aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv entries_[code_point & kMask] = CacheEntry(code_point, result); 249aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv return result; 
25a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup} 269aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv 27a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstruptemplate <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, 289aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv uchar* result) { 29c3571b24025ff81d49c6e71c79f7a47269fc1c5fRakesh Iyer CacheEntry entry = entries_[c & kMask]; 309aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv if (entry.code_point_ == c) { 315997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh if (entry.offset_ == 0) { 325997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh return 0; 335997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh } else { 349aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv result[0] = c + entry.offset_; 359aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv return 1; 369aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv } 37a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup } else { 38a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup return CalculateValue(c, n, result); 39a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup } 40e13d8ef364d2e2226562f7e27c9ae353502ba113Jason Tholstrup} 41a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup 42a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstruptemplate <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, 43a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup uchar* result) { 44a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup bool allow_caching = true; 45a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup int length = T::Convert(c, n, result, &allow_caching); 469aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv if (allow_caching) { 479aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv if (length == 1) { 489aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv entries_[c & kMask] = CacheEntry(c, result[0] - c); 499aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv return 1; 
50a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup } else { 519aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv entries_[c & kMask] = CacheEntry(c, 0); 529aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv return 0; 53280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv } 54a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer } else { 55280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv return length; 56280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv } 57e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park} 585997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh 59e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen 60e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Parkuint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { 61e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park DCHECK(c > Latin1::kMaxChar); 6212ebb3398f19075c33c6482ecce4c0c3b6033dcbYao Chen switch (c) { 63e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park // This are equivalent characters in unicode. 646d5847b1cf70efb8d70cd6704459ccc88f561925Rakesh Iyer case 0x39c: 65e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park case 0x3bc: 66e13d8ef364d2e2226562f7e27c9ae353502ba113Jason Tholstrup return 0xb5; 67e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park // This is an uppercase of a Latin-1 character 68e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen // outside of Latin-1. 
69280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv case 0x178: 70280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv return 0xff; 71280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv } 72280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv return 0; 73a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup} 74280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv 75280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv 769d4b05da918f8c4414c9c57686c9fa802d6eec13Vitalii Tomkivunsigned Utf8::EncodeOneByte(char* str, uint8_t c) { 77a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup static const int kMask = ~(1 << 6); 78280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv if (c <= kMaxOneByteChar) { 79e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen str[0] = c; 80a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup return 1; 81a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer } 82a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer str[0] = 0xC0 | (c >> 6); 83a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer str[1] = 0x80 | (c & kMask); 84a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer return 2; 85a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer} 86a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer 87a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// Encode encodes the UTF-16 code units c and previous into the given str 88a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// buffer, and combines surrogate code units into single code points. If 89a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// replace_invalid is set to true, orphan surrogate code units will be replaced 90a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// with kBadChar. 
913388e7848f3a30029935463afafe9b8280939127Keun-young Parkunsigned Utf8::Encode(char* str, 92a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer uchar c, 93a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer int previous, 94a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer bool replace_invalid) { 95a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer static const int kMask = ~(1 << 6); 96a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer if (c <= kMaxOneByteChar) { 97a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer str[0] = c; 98a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer return 1; 99a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer } else if (c <= kMaxTwoByteChar) { 100a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer str[0] = 0xC0 | (c >> 6); 1019688038518ab6bb23841d94b68b9597122b4a279Scott Main str[1] = 0x80 | (c & kMask); 102a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer return 2; 103a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer } else if (c <= kMaxThreeByteChar) { 104a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer if (Utf16::IsSurrogatePair(previous, c)) { 105a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; 106e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park return Encode(str - kUnmatchedSize, 107 Utf16::CombineSurrogatePair(previous, c), 108 Utf16::kNoPreviousCharacter, 109 replace_invalid) - kUnmatchedSize; 110 } else if (replace_invalid && 111 (Utf16::IsLeadSurrogate(c) || 112 Utf16::IsTrailSurrogate(c))) { 113 c = kBadChar; 114 } 115 str[0] = 0xE0 | (c >> 12); 116 str[1] = 0x80 | ((c >> 6) & kMask); 117 str[2] = 0x80 | (c & kMask); 118 return 3; 119 } else { 120 str[0] = 0xF0 | (c >> 18); 121 str[1] = 0x80 | ((c >> 12) & kMask); 122 str[2] = 0x80 | ((c >> 6) & kMask); 123 str[3] = 0x80 | (c & kMask); 124 return 4; 125 } 126} 127 128 129uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { 130 if (length <= 0) return kBadChar; 131 byte first 
= bytes[0]; 132 // Characters between 0000 and 0007F are encoded as a single character 133 if (first <= kMaxOneByteChar) { 134 *cursor += 1; 135 return first; 136 } 137 return CalculateValue(bytes, length, cursor); 138} 139 140unsigned Utf8::Length(uchar c, int previous) { 141 if (c <= kMaxOneByteChar) { 142 return 1; 143 } else if (c <= kMaxTwoByteChar) { 144 return 2; 145 } else if (c <= kMaxThreeByteChar) { 146 if (Utf16::IsTrailSurrogate(c) && 147 Utf16::IsLeadSurrogate(previous)) { 148 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 149 } 150 return 3; 151 } else { 152 return 4; 153 } 154} 155 156Utf8DecoderBase::Utf8DecoderBase() 157 : unbuffered_start_(NULL), 158 utf16_length_(0), 159 last_byte_of_buffer_unused_(false) {} 160 161Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, 162 unsigned buffer_length, 163 const uint8_t* stream, 164 unsigned stream_length) { 165 Reset(buffer, buffer_length, stream, stream_length); 166} 167 168template<unsigned kBufferSize> 169Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) 170 : Utf8DecoderBase(buffer_, 171 kBufferSize, 172 reinterpret_cast<const uint8_t*>(stream), 173 length) { 174} 175 176template<unsigned kBufferSize> 177void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { 178 Utf8DecoderBase::Reset(buffer_, 179 kBufferSize, 180 reinterpret_cast<const uint8_t*>(stream), 181 length); 182} 183 184template <unsigned kBufferSize> 185unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, 186 unsigned length) const { 187 DCHECK(length > 0); 188 if (length > utf16_length_) length = utf16_length_; 189 // memcpy everything in buffer. 190 unsigned buffer_length = 191 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; 192 unsigned memcpy_length = length <= buffer_length ? 
length : buffer_length; 193 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); 194 if (length <= buffer_length) return length; 195 DCHECK(unbuffered_start_ != NULL); 196 // Copy the rest the slow way. 197 WriteUtf16Slow(unbuffered_start_, 198 data + buffer_length, 199 length - buffer_length); 200 return length; 201} 202 203} // namespace unibrow 204 205#endif // V8_UNICODE_INL_H_ 206