1// Copyright 2007-2010 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_UNICODE_INL_H_
6#define V8_UNICODE_INL_H_
7
8#include "src/unicode.h"
9#include "src/base/logging.h"
10#include "src/utils.h"
11
12namespace unibrow {
13
14template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
15  CacheEntry entry = entries_[code_point & kMask];
16  if (entry.code_point() == code_point) return entry.value();
17  return CalculateValue(code_point);
18}
19
20template <class T, int s> bool Predicate<T, s>::CalculateValue(
21    uchar code_point) {
22  bool result = T::Is(code_point);
23  entries_[code_point & kMask] = CacheEntry(code_point, result);
24  return result;
25}
26
27template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
28    uchar* result) {
29  CacheEntry entry = entries_[c & kMask];
30  if (entry.code_point_ == c) {
31    if (entry.offset_ == 0) {
32      return 0;
33    } else {
34      result[0] = c + entry.offset_;
35      return 1;
36    }
37  } else {
38    return CalculateValue(c, n, result);
39  }
40}
41
42template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
43    uchar* result) {
44  bool allow_caching = true;
45  int length = T::Convert(c, n, result, &allow_caching);
46  if (allow_caching) {
47    if (length == 1) {
48      entries_[c & kMask] = CacheEntry(c, result[0] - c);
49      return 1;
50    } else {
51      entries_[c & kMask] = CacheEntry(c, 0);
52      return 0;
53    }
54  } else {
55    return length;
56  }
57}
58
59
60unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
61  static const int kMask = ~(1 << 6);
62  if (c <= kMaxOneByteChar) {
63    str[0] = c;
64    return 1;
65  }
66  str[0] = 0xC0 | (c >> 6);
67  str[1] = 0x80 | (c & kMask);
68  return 2;
69}
70
71// Encode encodes the UTF-16 code units c and previous into the given str
72// buffer, and combines surrogate code units into single code points. If
73// replace_invalid is set to true, orphan surrogate code units will be replaced
74// with kBadChar.
75unsigned Utf8::Encode(char* str,
76                      uchar c,
77                      int previous,
78                      bool replace_invalid) {
79  static const int kMask = ~(1 << 6);
80  if (c <= kMaxOneByteChar) {
81    str[0] = c;
82    return 1;
83  } else if (c <= kMaxTwoByteChar) {
84    str[0] = 0xC0 | (c >> 6);
85    str[1] = 0x80 | (c & kMask);
86    return 2;
87  } else if (c <= kMaxThreeByteChar) {
88    if (Utf16::IsSurrogatePair(previous, c)) {
89      const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
90      return Encode(str - kUnmatchedSize,
91                    Utf16::CombineSurrogatePair(previous, c),
92                    Utf16::kNoPreviousCharacter,
93                    replace_invalid) - kUnmatchedSize;
94    } else if (replace_invalid &&
95               (Utf16::IsLeadSurrogate(c) ||
96               Utf16::IsTrailSurrogate(c))) {
97      c = kBadChar;
98    }
99    str[0] = 0xE0 | (c >> 12);
100    str[1] = 0x80 | ((c >> 6) & kMask);
101    str[2] = 0x80 | (c & kMask);
102    return 3;
103  } else {
104    str[0] = 0xF0 | (c >> 18);
105    str[1] = 0x80 | ((c >> 12) & kMask);
106    str[2] = 0x80 | ((c >> 6) & kMask);
107    str[3] = 0x80 | (c & kMask);
108    return 4;
109  }
110}
111
112
113uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
114  if (length <= 0) return kBadChar;
115  byte first = bytes[0];
116  // Characters between 0000 and 0007F are encoded as a single character
117  if (first <= kMaxOneByteChar) {
118    *cursor += 1;
119    return first;
120  }
121  return CalculateValue(bytes, length, cursor);
122}
123
124unsigned Utf8::Length(uchar c, int previous) {
125  if (c <= kMaxOneByteChar) {
126    return 1;
127  } else if (c <= kMaxTwoByteChar) {
128    return 2;
129  } else if (c <= kMaxThreeByteChar) {
130    if (Utf16::IsTrailSurrogate(c) &&
131        Utf16::IsLeadSurrogate(previous)) {
132      return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
133    }
134    return 3;
135  } else {
136    return 4;
137  }
138}
139
140bool Utf8::IsValidCharacter(uchar c) {
141  return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
142         (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
143          c != kBadChar);
144}
145
146}  // namespace unibrow
147
148#endif  // V8_UNICODE_INL_H_
149