// Copyright 2007-2010 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
5a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#ifndef V8_UNICODE_INL_H_
6a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#define V8_UNICODE_INL_H_
7a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
8b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "src/unicode.h"
9b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "src/base/logging.h"
10b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "src/utils.h"
11a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
12a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocknamespace unibrow {
13a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
14a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
15a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  CacheEntry entry = entries_[code_point & kMask];
16a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  if (entry.code_point_ == code_point) return entry.value_;
17a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  return CalculateValue(code_point);
18a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
19a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
20a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> bool Predicate<T, s>::CalculateValue(
21a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    uchar code_point) {
22a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  bool result = T::Is(code_point);
23a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  entries_[code_point & kMask] = CacheEntry(code_point, result);
24a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  return result;
25a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
26a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
27a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
28a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    uchar* result) {
29a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  CacheEntry entry = entries_[c & kMask];
30a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  if (entry.code_point_ == c) {
31a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    if (entry.offset_ == 0) {
32a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      return 0;
33a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    } else {
34a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      result[0] = c + entry.offset_;
35a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      return 1;
36a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    }
37a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else {
38a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return CalculateValue(c, n, result);
39a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  }
40a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
41a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
42a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
43a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    uchar* result) {
44a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  bool allow_caching = true;
45a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  int length = T::Convert(c, n, result, &allow_caching);
46a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  if (allow_caching) {
47a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    if (length == 1) {
48a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      entries_[c & kMask] = CacheEntry(c, result[0] - c);
49a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      return 1;
50a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    } else {
51a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      entries_[c & kMask] = CacheEntry(c, 0);
52a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      return 0;
53a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    }
54a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else {
55a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return length;
56a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  }
57a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
58a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
59a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
60b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochuint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
61b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  DCHECK(c > Latin1::kMaxChar);
62b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  switch (c) {
63b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    // This are equivalent characters in unicode.
64b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    case 0x39c:
65b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    case 0x3bc:
66b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch      return 0xb5;
67b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    // This is an uppercase of a Latin-1 character
68b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    // outside of Latin-1.
69b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    case 0x178:
70b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch      return 0xff;
71b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  }
72b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  return 0;
73b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch}
74b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch
75b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch
76b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochunsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
77b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  static const int kMask = ~(1 << 6);
78b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  if (c <= kMaxOneByteChar) {
79b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    str[0] = c;
80b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    return 1;
81b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  }
82b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  str[0] = 0xC0 | (c >> 6);
83b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  str[1] = 0x80 | (c & kMask);
84b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  return 2;
85b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch}
86b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch
87b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// Encode encodes the UTF-16 code units c and previous into the given str
88b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// buffer, and combines surrogate code units into single code points. If
89b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// replace_invalid is set to true, orphan surrogate code units will be replaced
90b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// with kBadChar.
91b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochunsigned Utf8::Encode(char* str,
92b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                      uchar c,
93b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                      int previous,
94b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                      bool replace_invalid) {
95a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMask = ~(1 << 6);
96a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  if (c <= kMaxOneByteChar) {
97a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[0] = c;
98a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 1;
99a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else if (c <= kMaxTwoByteChar) {
100a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[0] = 0xC0 | (c >> 6);
101a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[1] = 0x80 | (c & kMask);
102a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 2;
103a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else if (c <= kMaxThreeByteChar) {
104b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    if (Utf16::IsSurrogatePair(previous, c)) {
1053ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch      const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
1063ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch      return Encode(str - kUnmatchedSize,
1073ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch                    Utf16::CombineSurrogatePair(previous, c),
108b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                    Utf16::kNoPreviousCharacter,
109b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                    replace_invalid) - kUnmatchedSize;
110b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    } else if (replace_invalid &&
111b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch               (Utf16::IsLeadSurrogate(c) ||
112b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch               Utf16::IsTrailSurrogate(c))) {
113b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch      c = kBadChar;
1143ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    }
115a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[0] = 0xE0 | (c >> 12);
116a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[1] = 0x80 | ((c >> 6) & kMask);
117a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[2] = 0x80 | (c & kMask);
118a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 3;
119a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else {
120a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[0] = 0xF0 | (c >> 18);
121a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[1] = 0x80 | ((c >> 12) & kMask);
122a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[2] = 0x80 | ((c >> 6) & kMask);
123a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    str[3] = 0x80 | (c & kMask);
124a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 4;
125a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  }
126a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
127a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
128a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
129a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockuchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
130a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  if (length <= 0) return kBadChar;
131a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  byte first = bytes[0];
132a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // Characters between 0000 and 0007F are encoded as a single character
133a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  if (first <= kMaxOneByteChar) {
134a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    *cursor += 1;
135a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return first;
136a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  }
137a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  return CalculateValue(bytes, length, cursor);
138a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
139a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
1403ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdochunsigned Utf8::Length(uchar c, int previous) {
141a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  if (c <= kMaxOneByteChar) {
142a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 1;
143a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else if (c <= kMaxTwoByteChar) {
144a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 2;
145a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else if (c <= kMaxThreeByteChar) {
1463ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    if (Utf16::IsTrailSurrogate(c) &&
1473ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch        Utf16::IsLeadSurrogate(previous)) {
1483ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch      return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
1493ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    }
150a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 3;
151a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  } else {
152a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    return 4;
153a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  }
154a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
155a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
156b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUtf8DecoderBase::Utf8DecoderBase()
157b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  : unbuffered_start_(NULL),
158b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    utf16_length_(0),
159b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch    last_byte_of_buffer_unused_(false) {}
160b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch
161b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUtf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
162b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                                 unsigned buffer_length,
163b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                                 const uint8_t* stream,
164b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                                 unsigned stream_length) {
165b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  Reset(buffer, buffer_length, stream, stream_length);
166b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch}
167b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch
168b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochtemplate<unsigned kBufferSize>
169b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUtf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
170b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  : Utf8DecoderBase(buffer_,
171b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                    kBufferSize,
172b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                    reinterpret_cast<const uint8_t*>(stream),
173b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                    length) {
174b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch}
175b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch
176b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochtemplate<unsigned kBufferSize>
177b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochvoid Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
178b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  Utf8DecoderBase::Reset(buffer_,
179b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                         kBufferSize,
180b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                         reinterpret_cast<const uint8_t*>(stream),
181b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                         length);
182b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch}
183b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch
184b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochtemplate <unsigned kBufferSize>
185b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdochunsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
186b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                                              unsigned length) const {
187b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  DCHECK(length > 0);
188b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  if (length > utf16_length_) length = utf16_length_;
189b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  // memcpy everything in buffer.
190b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  unsigned buffer_length =
191b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
192b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
193b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
194b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  if (length <= buffer_length) return length;
195b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  DCHECK(unbuffered_start_ != NULL);
196b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  // Copy the rest the slow way.
197b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  WriteUtf16Slow(unbuffered_start_,
198b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                 data + buffer_length,
199b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch                 length - buffer_length);
200b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch  return length;
201a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}
202a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
203a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}  // namespace unibrow
204a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
205a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#endif  // V8_UNICODE_INL_H_
206