1bb769b257e753aafcbd96767abb2abc645eaa20cBen Murdoch// Copyright 2007-2010 the V8 project authors. All rights reserved. 2b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// Use of this source code is governed by a BSD-style license that can be 3b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// found in the LICENSE file. 4a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 5a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#ifndef V8_UNICODE_INL_H_ 6a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#define V8_UNICODE_INL_H_ 7a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 8b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "src/unicode.h" 9b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "src/base/logging.h" 10b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "src/utils.h" 11a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 12a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocknamespace unibrow { 13a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 14a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> bool Predicate<T, s>::get(uchar code_point) { 15a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block CacheEntry entry = entries_[code_point & kMask]; 16a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (entry.code_point_ == code_point) return entry.value_; 17a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return CalculateValue(code_point); 18a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 19a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 20a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> bool Predicate<T, s>::CalculateValue( 21a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block uchar code_point) { 22a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block bool result = T::Is(code_point); 23a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block entries_[code_point & kMask] = CacheEntry(code_point, result); 24a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return result; 25a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 26a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 27a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, 28a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block uchar* result) { 29a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block CacheEntry entry = entries_[c & kMask]; 30a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (entry.code_point_ == c) { 31a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (entry.offset_ == 0) { 32a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 0; 33a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else { 34a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block result[0] = c + entry.offset_; 35a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 1; 36a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } 37a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else { 38a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return CalculateValue(c, n, result); 39a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } 40a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 41a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 42a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, 43a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block uchar* result) { 44a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block bool allow_caching = true; 45a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block int length = T::Convert(c, n, result, &allow_caching); 46a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (allow_caching) { 47a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (length == 1) { 48a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block entries_[c & kMask] = CacheEntry(c, result[0] - c); 49a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 1; 50a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else { 51a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block entries_[c & kMask] = CacheEntry(c, 0); 52a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 0; 53a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } 54a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else { 55a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return length; 56a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } 57a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 58a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 59a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 60b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochuint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { 61b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch DCHECK(c > Latin1::kMaxChar); 62b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch switch (c) { 63b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch // This are equivalent characters in unicode. 64b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch case 0x39c: 65b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch case 0x3bc: 66b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return 0xb5; 67b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch // This is an uppercase of a Latin-1 character 68b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch // outside of Latin-1. 69b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch case 0x178: 70b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return 0xff; 71b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } 72b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return 0; 73b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 74b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 75b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 76b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochunsigned Utf8::EncodeOneByte(char* str, uint8_t c) { 77b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch static const int kMask = ~(1 << 6); 78b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch if (c <= kMaxOneByteChar) { 79b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch str[0] = c; 80b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return 1; 81b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } 82b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch str[0] = 0xC0 | (c >> 6); 83b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch str[1] = 0x80 | (c & kMask); 84b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return 2; 85b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 86b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 87b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// Encode encodes the UTF-16 code units c and previous into the given str 88b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// buffer, and combines surrogate code units into single code points. If 89b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// replace_invalid is set to true, orphan surrogate code units will be replaced 90b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// with kBadChar. 91b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochunsigned Utf8::Encode(char* str, 92b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch uchar c, 93b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch int previous, 94b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch bool replace_invalid) { 95a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block static const int kMask = ~(1 << 6); 96a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (c <= kMaxOneByteChar) { 97a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[0] = c; 98a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 1; 99a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else if (c <= kMaxTwoByteChar) { 100a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[0] = 0xC0 | (c >> 6); 101a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[1] = 0x80 | (c & kMask); 102a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 2; 103a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else if (c <= kMaxThreeByteChar) { 104b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch if (Utf16::IsSurrogatePair(previous, c)) { 1053ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; 1063ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch return Encode(str - kUnmatchedSize, 1073ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch Utf16::CombineSurrogatePair(previous, c), 108b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch Utf16::kNoPreviousCharacter, 109b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch replace_invalid) - kUnmatchedSize; 110b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } else if (replace_invalid && 111b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch (Utf16::IsLeadSurrogate(c) || 112b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch Utf16::IsTrailSurrogate(c))) { 113b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch c = kBadChar; 1143ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch } 115a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[0] = 0xE0 | (c >> 12); 116a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[1] = 0x80 | ((c >> 6) & kMask); 117a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[2] = 0x80 | (c & kMask); 118a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 3; 119a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else { 120a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[0] = 0xF0 | (c >> 18); 121a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[1] = 0x80 | ((c >> 12) & kMask); 122a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[2] = 0x80 | ((c >> 6) & kMask); 123a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block str[3] = 0x80 | (c & kMask); 124a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 4; 125a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } 126a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 127a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 128a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 129a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockuchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) { 130a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (length <= 0) return kBadChar; 131a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block byte first = bytes[0]; 132a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block // Characters between 0000 and 0007F are encoded as a single character 133a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (first <= kMaxOneByteChar) { 134a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block *cursor += 1; 135a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return first; 136a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } 137a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return CalculateValue(bytes, length, cursor); 138a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 139a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 1403ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdochunsigned Utf8::Length(uchar c, int previous) { 141a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block if (c <= kMaxOneByteChar) { 142a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 1; 143a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else if (c <= kMaxTwoByteChar) { 144a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 2; 145a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else if (c <= kMaxThreeByteChar) { 1463ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch if (Utf16::IsTrailSurrogate(c) && 1473ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch Utf16::IsLeadSurrogate(previous)) { 1483ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 1493ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch } 150a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 3; 151a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } else { 152a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block return 4; 153a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block } 154a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 155a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 156b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUtf8DecoderBase::Utf8DecoderBase() 157b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch : unbuffered_start_(NULL), 158b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch utf16_length_(0), 159b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch last_byte_of_buffer_unused_(false) {} 160b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 161b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUtf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, 162b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch unsigned buffer_length, 163b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch const uint8_t* stream, 164b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch unsigned stream_length) { 165b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch Reset(buffer, buffer_length, stream, stream_length); 166b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 167b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 168b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochtemplate<unsigned kBufferSize> 169b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUtf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) 170b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch : Utf8DecoderBase(buffer_, 171b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch kBufferSize, 172b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch reinterpret_cast<const uint8_t*>(stream), 173b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch length) { 174b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 175b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 176b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochtemplate<unsigned kBufferSize> 177b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochvoid Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { 178b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch Utf8DecoderBase::Reset(buffer_, 179b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch kBufferSize, 180b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch reinterpret_cast<const uint8_t*>(stream), 181b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch length); 182b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 183b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 184b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochtemplate <unsigned kBufferSize> 185b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochunsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, 186b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch unsigned length) const { 187b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch DCHECK(length > 0); 188b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch if (length > utf16_length_) length = utf16_length_; 189b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch // memcpy everything in buffer. 190b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch unsigned buffer_length = 191b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize; 192b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch unsigned memcpy_length = length <= buffer_length ? length : buffer_length; 193b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); 194b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch if (length <= buffer_length) return length; 195b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch DCHECK(unbuffered_start_ != NULL); 196b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch // Copy the rest the slow way. 197b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch WriteUtf16Slow(unbuffered_start_, 198b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch data + buffer_length, 199b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch length - buffer_length); 200b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return length; 201a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} 202a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 203a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block} // namespace unibrow 204a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block 205a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#endif // V8_UNICODE_INL_H_ 206