1// Copyright 2007-2010 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef V8_UNICODE_INL_H_ 6#define V8_UNICODE_INL_H_ 7 8#include "src/unicode.h" 9#include "src/base/logging.h" 10#include "src/utils.h" 11 12namespace unibrow { 13 14template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { 15 CacheEntry entry = entries_[code_point & kMask]; 16 if (entry.code_point_ == code_point) return entry.value_; 17 return CalculateValue(code_point); 18} 19 20template <class T, int s> bool Predicate<T, s>::CalculateValue( 21 uchar code_point) { 22 bool result = T::Is(code_point); 23 entries_[code_point & kMask] = CacheEntry(code_point, result); 24 return result; 25} 26 27template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, 28 uchar* result) { 29 CacheEntry entry = entries_[c & kMask]; 30 if (entry.code_point_ == c) { 31 if (entry.offset_ == 0) { 32 return 0; 33 } else { 34 result[0] = c + entry.offset_; 35 return 1; 36 } 37 } else { 38 return CalculateValue(c, n, result); 39 } 40} 41 42template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, 43 uchar* result) { 44 bool allow_caching = true; 45 int length = T::Convert(c, n, result, &allow_caching); 46 if (allow_caching) { 47 if (length == 1) { 48 entries_[c & kMask] = CacheEntry(c, result[0] - c); 49 return 1; 50 } else { 51 entries_[c & kMask] = CacheEntry(c, 0); 52 return 0; 53 } 54 } else { 55 return length; 56 } 57} 58 59 60uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { 61 DCHECK(c > Latin1::kMaxChar); 62 switch (c) { 63 // This are equivalent characters in unicode. 64 case 0x39c: 65 case 0x3bc: 66 return 0xb5; 67 // This is an uppercase of a Latin-1 character 68 // outside of Latin-1. 69 case 0x178: 70 return 0xff; 71 } 72 return 0; 73} 74 75 76unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { 77 static const int kMask = ~(1 << 6); 78 if (c <= kMaxOneByteChar) { 79 str[0] = c; 80 return 1; 81 } 82 str[0] = 0xC0 | (c >> 6); 83 str[1] = 0x80 | (c & kMask); 84 return 2; 85} 86 87// Encode encodes the UTF-16 code units c and previous into the given str 88// buffer, and combines surrogate code units into single code points. If 89// replace_invalid is set to true, orphan surrogate code units will be replaced 90// with kBadChar. 91unsigned Utf8::Encode(char* str, 92 uchar c, 93 int previous, 94 bool replace_invalid) { 95 static const int kMask = ~(1 << 6); 96 if (c <= kMaxOneByteChar) { 97 str[0] = c; 98 return 1; 99 } else if (c <= kMaxTwoByteChar) { 100 str[0] = 0xC0 | (c >> 6); 101 str[1] = 0x80 | (c & kMask); 102 return 2; 103 } else if (c <= kMaxThreeByteChar) { 104 if (Utf16::IsSurrogatePair(previous, c)) { 105 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; 106 return Encode(str - kUnmatchedSize, 107 Utf16::CombineSurrogatePair(previous, c), 108 Utf16::kNoPreviousCharacter, 109 replace_invalid) - kUnmatchedSize; 110 } else if (replace_invalid && 111 (Utf16::IsLeadSurrogate(c) || 112 Utf16::IsTrailSurrogate(c))) { 113 c = kBadChar; 114 } 115 str[0] = 0xE0 | (c >> 12); 116 str[1] = 0x80 | ((c >> 6) & kMask); 117 str[2] = 0x80 | (c & kMask); 118 return 3; 119 } else { 120 str[0] = 0xF0 | (c >> 18); 121 str[1] = 0x80 | ((c >> 12) & kMask); 122 str[2] = 0x80 | ((c >> 6) & kMask); 123 str[3] = 0x80 | (c & kMask); 124 return 4; 125 } 126} 127 128 129uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { 130 if (length <= 0) return kBadChar; 131 byte first = bytes[0]; 132 // Characters between 0000 and 0007F are encoded as a single character 133 if (first <= kMaxOneByteChar) { 134 *cursor += 1; 135 return first; 136 } 137 return CalculateValue(bytes, length, cursor); 138} 139 140unsigned Utf8::Length(uchar c, int previous) { 141 if (c <= kMaxOneByteChar) { 142 return 1; 143 } else if (c <= kMaxTwoByteChar) { 144 return 2; 145 } else if (c <= kMaxThreeByteChar) { 146 if (Utf16::IsTrailSurrogate(c) && 147 Utf16::IsLeadSurrogate(previous)) { 148 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 149 } 150 return 3; 151 } else { 152 return 4; 153 } 154} 155 156Utf8DecoderBase::Utf8DecoderBase() 157 : unbuffered_start_(NULL), 158 utf16_length_(0), 159 last_byte_of_buffer_unused_(false) {} 160 161Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, 162 unsigned buffer_length, 163 const uint8_t* stream, 164 unsigned stream_length) { 165 Reset(buffer, buffer_length, stream, stream_length); 166} 167 168template<unsigned kBufferSize> 169Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) 170 : Utf8DecoderBase(buffer_, 171 kBufferSize, 172 reinterpret_cast<const uint8_t*>(stream), 173 length) { 174} 175 176template<unsigned kBufferSize> 177void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { 178 Utf8DecoderBase::Reset(buffer_, 179 kBufferSize, 180 reinterpret_cast<const uint8_t*>(stream), 181 length); 182} 183 184template <unsigned kBufferSize> 185unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, 186 unsigned length) const { 187 DCHECK(length > 0); 188 if (length > utf16_length_) length = utf16_length_; 189 // memcpy everything in buffer. 190 unsigned buffer_length = 191 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; 192 unsigned memcpy_length = length <= buffer_length ? length : buffer_length; 193 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); 194 if (length <= buffer_length) return length; 195 DCHECK(unbuffered_start_ != NULL); 196 // Copy the rest the slow way. 197 WriteUtf16Slow(unbuffered_start_, 198 data + buffer_length, 199 length - buffer_length); 200 return length; 201} 202 203} // namespace unibrow 204 205#endif // V8_UNICODE_INL_H_ 206