1// Copyright 2007-2010 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef V8_UNICODE_INL_H_ 6#define V8_UNICODE_INL_H_ 7 8#include "src/unicode.h" 9#include "src/base/logging.h" 10#include "src/utils.h" 11 12namespace unibrow { 13 14template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { 15 CacheEntry entry = entries_[code_point & kMask]; 16 if (entry.code_point() == code_point) return entry.value(); 17 return CalculateValue(code_point); 18} 19 20template <class T, int s> bool Predicate<T, s>::CalculateValue( 21 uchar code_point) { 22 bool result = T::Is(code_point); 23 entries_[code_point & kMask] = CacheEntry(code_point, result); 24 return result; 25} 26 27template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, 28 uchar* result) { 29 CacheEntry entry = entries_[c & kMask]; 30 if (entry.code_point_ == c) { 31 if (entry.offset_ == 0) { 32 return 0; 33 } else { 34 result[0] = c + entry.offset_; 35 return 1; 36 } 37 } else { 38 return CalculateValue(c, n, result); 39 } 40} 41 42template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, 43 uchar* result) { 44 bool allow_caching = true; 45 int length = T::Convert(c, n, result, &allow_caching); 46 if (allow_caching) { 47 if (length == 1) { 48 entries_[c & kMask] = CacheEntry(c, result[0] - c); 49 return 1; 50 } else { 51 entries_[c & kMask] = CacheEntry(c, 0); 52 return 0; 53 } 54 } else { 55 return length; 56 } 57} 58 59 60unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { 61 static const int kMask = ~(1 << 6); 62 if (c <= kMaxOneByteChar) { 63 str[0] = c; 64 return 1; 65 } 66 str[0] = 0xC0 | (c >> 6); 67 str[1] = 0x80 | (c & kMask); 68 return 2; 69} 70 71// Encode encodes the UTF-16 code units c and previous into the given str 72// buffer, and combines surrogate code units into single code points. If 73// replace_invalid is set to true, orphan surrogate code units will be replaced 74// with kBadChar. 75unsigned Utf8::Encode(char* str, 76 uchar c, 77 int previous, 78 bool replace_invalid) { 79 static const int kMask = ~(1 << 6); 80 if (c <= kMaxOneByteChar) { 81 str[0] = c; 82 return 1; 83 } else if (c <= kMaxTwoByteChar) { 84 str[0] = 0xC0 | (c >> 6); 85 str[1] = 0x80 | (c & kMask); 86 return 2; 87 } else if (c <= kMaxThreeByteChar) { 88 if (Utf16::IsSurrogatePair(previous, c)) { 89 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; 90 return Encode(str - kUnmatchedSize, 91 Utf16::CombineSurrogatePair(previous, c), 92 Utf16::kNoPreviousCharacter, 93 replace_invalid) - kUnmatchedSize; 94 } else if (replace_invalid && 95 (Utf16::IsLeadSurrogate(c) || 96 Utf16::IsTrailSurrogate(c))) { 97 c = kBadChar; 98 } 99 str[0] = 0xE0 | (c >> 12); 100 str[1] = 0x80 | ((c >> 6) & kMask); 101 str[2] = 0x80 | (c & kMask); 102 return 3; 103 } else { 104 str[0] = 0xF0 | (c >> 18); 105 str[1] = 0x80 | ((c >> 12) & kMask); 106 str[2] = 0x80 | ((c >> 6) & kMask); 107 str[3] = 0x80 | (c & kMask); 108 return 4; 109 } 110} 111 112 113uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) { 114 if (length <= 0) return kBadChar; 115 byte first = bytes[0]; 116 // Characters between 0000 and 0007F are encoded as a single character 117 if (first <= kMaxOneByteChar) { 118 *cursor += 1; 119 return first; 120 } 121 return CalculateValue(bytes, length, cursor); 122} 123 124unsigned Utf8::Length(uchar c, int previous) { 125 if (c <= kMaxOneByteChar) { 126 return 1; 127 } else if (c <= kMaxTwoByteChar) { 128 return 2; 129 } else if (c <= kMaxThreeByteChar) { 130 if (Utf16::IsTrailSurrogate(c) && 131 Utf16::IsLeadSurrogate(previous)) { 132 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 133 } 134 return 3; 135 } else { 136 return 4; 137 } 138} 139 140bool Utf8::IsValidCharacter(uchar c) { 141 return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) || 142 (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu && 143 c != kBadChar); 144} 145 146} // namespace unibrow 147 148#endif // V8_UNICODE_INL_H_ 149