unicode-inl.h revision b8a8cc1952d61a2f3a2568848933943a543b5d3e
// Copyright 2007-2010 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen
5e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#ifndef V8_UNICODE_INL_H_
6e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#define V8_UNICODE_INL_H_
7e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen
8e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#include "src/unicode.h"
9e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#include "src/base/logging.h"
10e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen#include "src/utils.h"
11e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen
12e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chennamespace unibrow {
13e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen
14e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chentemplate <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
15e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen  CacheEntry entry = entries_[code_point & kMask];
16e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen  if (entry.code_point_ == code_point) return entry.value_;
17e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park  return CalculateValue(code_point);
18e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park}
19e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park
20e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chentemplate <class T, int s> bool Predicate<T, s>::CalculateValue(
21280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv    uchar code_point) {
229aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv  bool result = T::Is(code_point);
239aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv  entries_[code_point & kMask] = CacheEntry(code_point, result);
249aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv  return result;
25a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup}
269aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv
27a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstruptemplate <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
289aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv    uchar* result) {
29c3571b24025ff81d49c6e71c79f7a47269fc1c5fRakesh Iyer  CacheEntry entry = entries_[c & kMask];
309aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv  if (entry.code_point_ == c) {
315997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh    if (entry.offset_ == 0) {
325997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh      return 0;
335997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh    } else {
349aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv      result[0] = c + entry.offset_;
359aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv      return 1;
369aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv    }
37a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup  } else {
38a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup    return CalculateValue(c, n, result);
39a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup  }
40e13d8ef364d2e2226562f7e27c9ae353502ba113Jason Tholstrup}
41a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup
42a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstruptemplate <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
43a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup    uchar* result) {
44a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup  bool allow_caching = true;
45a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup  int length = T::Convert(c, n, result, &allow_caching);
469aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv  if (allow_caching) {
479aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv    if (length == 1) {
489aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv      entries_[c & kMask] = CacheEntry(c, result[0] - c);
499aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv      return 1;
50a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup    } else {
519aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv      entries_[c & kMask] = CacheEntry(c, 0);
529aef1ae7c554f7f77917dd57cfe5f5dabd96b454Vitalii Tomkiv      return 0;
53280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv    }
54a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  } else {
55280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv    return length;
56280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv  }
57e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park}
585997cb5d9ff7e5bd09a986623db0862faea5aa80Ishani Parekh
59e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen
60e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Parkuint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
61e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park  DCHECK(c > Latin1::kMaxChar);
6212ebb3398f19075c33c6482ecce4c0c3b6033dcbYao Chen  switch (c) {
63e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park    // This are equivalent characters in unicode.
646d5847b1cf70efb8d70cd6704459ccc88f561925Rakesh Iyer    case 0x39c:
65e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park    case 0x3bc:
66e13d8ef364d2e2226562f7e27c9ae353502ba113Jason Tholstrup      return 0xb5;
67e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park    // This is an uppercase of a Latin-1 character
68e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen    // outside of Latin-1.
69280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv    case 0x178:
70280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv      return 0xff;
71280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv  }
72280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv  return 0;
73a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup}
74280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv
75280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv
769d4b05da918f8c4414c9c57686c9fa802d6eec13Vitalii Tomkivunsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
77a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup  static const int kMask = ~(1 << 6);
78280b5721254e5ac974404e02e7589f17f560d1f9Vitalii Tomkiv  if (c <= kMaxOneByteChar) {
79e2de1d52bab76ae6be41dc2f2b48a9eb546e70bbYao Chen    str[0] = c;
80a0b23acf599b7cf57e1941352fae44e3c984c605Jason Tholstrup    return 1;
81a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  }
82a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  str[0] = 0xC0 | (c >> 6);
83a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  str[1] = 0x80 | (c & kMask);
84a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  return 2;
85a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer}
86a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer
87a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// Encode encodes the UTF-16 code units c and previous into the given str
88a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// buffer, and combines surrogate code units into single code points. If
89a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// replace_invalid is set to true, orphan surrogate code units will be replaced
90a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer// with kBadChar.
913388e7848f3a30029935463afafe9b8280939127Keun-young Parkunsigned Utf8::Encode(char* str,
92a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer                      uchar c,
93a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer                      int previous,
94a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer                      bool replace_invalid) {
95a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  static const int kMask = ~(1 << 6);
96a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  if (c <= kMaxOneByteChar) {
97a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer    str[0] = c;
98a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer    return 1;
99a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  } else if (c <= kMaxTwoByteChar) {
100a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer    str[0] = 0xC0 | (c >> 6);
1019688038518ab6bb23841d94b68b9597122b4a279Scott Main    str[1] = 0x80 | (c & kMask);
102a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer    return 2;
103a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer  } else if (c <= kMaxThreeByteChar) {
104a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer    if (Utf16::IsSurrogatePair(previous, c)) {
105a4ae996169999ab5fc8eedd3106e247bc5ff219fRakesh Iyer      const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
106e54ac276796c6535558f8444d882adecd19ce2bdKeun-young Park      return Encode(str - kUnmatchedSize,
107                    Utf16::CombineSurrogatePair(previous, c),
108                    Utf16::kNoPreviousCharacter,
109                    replace_invalid) - kUnmatchedSize;
110    } else if (replace_invalid &&
111               (Utf16::IsLeadSurrogate(c) ||
112               Utf16::IsTrailSurrogate(c))) {
113      c = kBadChar;
114    }
115    str[0] = 0xE0 | (c >> 12);
116    str[1] = 0x80 | ((c >> 6) & kMask);
117    str[2] = 0x80 | (c & kMask);
118    return 3;
119  } else {
120    str[0] = 0xF0 | (c >> 18);
121    str[1] = 0x80 | ((c >> 12) & kMask);
122    str[2] = 0x80 | ((c >> 6) & kMask);
123    str[3] = 0x80 | (c & kMask);
124    return 4;
125  }
126}
127
128
129uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
130  if (length <= 0) return kBadChar;
131  byte first = bytes[0];
132  // Characters between 0000 and 0007F are encoded as a single character
133  if (first <= kMaxOneByteChar) {
134    *cursor += 1;
135    return first;
136  }
137  return CalculateValue(bytes, length, cursor);
138}
139
140unsigned Utf8::Length(uchar c, int previous) {
141  if (c <= kMaxOneByteChar) {
142    return 1;
143  } else if (c <= kMaxTwoByteChar) {
144    return 2;
145  } else if (c <= kMaxThreeByteChar) {
146    if (Utf16::IsTrailSurrogate(c) &&
147        Utf16::IsLeadSurrogate(previous)) {
148      return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
149    }
150    return 3;
151  } else {
152    return 4;
153  }
154}
155
156Utf8DecoderBase::Utf8DecoderBase()
157  : unbuffered_start_(NULL),
158    utf16_length_(0),
159    last_byte_of_buffer_unused_(false) {}
160
161Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
162                                 unsigned buffer_length,
163                                 const uint8_t* stream,
164                                 unsigned stream_length) {
165  Reset(buffer, buffer_length, stream, stream_length);
166}
167
168template<unsigned kBufferSize>
169Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
170  : Utf8DecoderBase(buffer_,
171                    kBufferSize,
172                    reinterpret_cast<const uint8_t*>(stream),
173                    length) {
174}
175
176template<unsigned kBufferSize>
177void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
178  Utf8DecoderBase::Reset(buffer_,
179                         kBufferSize,
180                         reinterpret_cast<const uint8_t*>(stream),
181                         length);
182}
183
184template <unsigned kBufferSize>
185unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
186                                              unsigned length) const {
187  DCHECK(length > 0);
188  if (length > utf16_length_) length = utf16_length_;
189  // memcpy everything in buffer.
190  unsigned buffer_length =
191      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
192  unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
193  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
194  if (length <= buffer_length) return length;
195  DCHECK(unbuffered_start_ != NULL);
196  // Copy the rest the slow way.
197  WriteUtf16Slow(unbuffered_start_,
198                 data + buffer_length,
199                 length - buffer_length);
200  return length;
201}
202
203}  // namespace unibrow
204
205#endif  // V8_UNICODE_INL_H_
206