1// Copyright 2014 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_UNICODE_DECODER_H_
6#define V8_UNICODE_DECODER_H_
7
8#include <sys/types.h>
9#include "src/globals.h"
10#include "src/utils.h"
11
12namespace unibrow {
13
14class V8_EXPORT_PRIVATE Utf8DecoderBase {
15 public:
16  // Initialization done in subclass.
17  inline Utf8DecoderBase();
18  inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
19                         const uint8_t* stream, size_t stream_length);
20  inline size_t Utf16Length() const { return utf16_length_; }
21
22 protected:
23  // This reads all characters and sets the utf16_length_.
24  // The first buffer_length utf16 chars are cached in the buffer.
25  void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
26             size_t stream_length);
27  static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
28                             uint16_t* data, size_t length);
29  const uint8_t* unbuffered_start_;
30  size_t unbuffered_length_;
31  size_t utf16_length_;
32  bool last_byte_of_buffer_unused_;
33
34 private:
35  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
36};
37
38template <size_t kBufferSize>
39class Utf8Decoder : public Utf8DecoderBase {
40 public:
41  inline Utf8Decoder() {}
42  inline Utf8Decoder(const char* stream, size_t length);
43  inline void Reset(const char* stream, size_t length);
44  inline size_t WriteUtf16(uint16_t* data, size_t length) const;
45
46 private:
47  uint16_t buffer_[kBufferSize];
48};
49
50
51Utf8DecoderBase::Utf8DecoderBase()
52    : unbuffered_start_(NULL),
53      unbuffered_length_(0),
54      utf16_length_(0),
55      last_byte_of_buffer_unused_(false) {}
56
57
58Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
59                                 const uint8_t* stream, size_t stream_length) {
60  Reset(buffer, buffer_length, stream, stream_length);
61}
62
63
64template <size_t kBufferSize>
65Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
66    : Utf8DecoderBase(buffer_, kBufferSize,
67                      reinterpret_cast<const uint8_t*>(stream), length) {}
68
69
70template <size_t kBufferSize>
71void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
72  Utf8DecoderBase::Reset(buffer_, kBufferSize,
73                         reinterpret_cast<const uint8_t*>(stream), length);
74}
75
76
77template <size_t kBufferSize>
78size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
79                                            size_t length) const {
80  DCHECK(length > 0);
81  if (length > utf16_length_) length = utf16_length_;
82  // memcpy everything in buffer.
83  size_t buffer_length =
84      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
85  size_t memcpy_length = length <= buffer_length ? length : buffer_length;
86  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
87  if (length <= buffer_length) return length;
88  DCHECK(unbuffered_start_ != NULL);
89  // Copy the rest the slow way.
90  WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
91                 length - buffer_length);
92  return length;
93}
94
// Constants and helpers for the Latin-1 character range.
class Latin1 {
 public:
  // Upper bound of the Latin-1 range.
  static const unsigned kMaxChar = 0xFF;
  // Returns 0 if |c| does not convert to a single Latin-1 character, or if
  // the character does not convert back to Latin-1 via the inverse operation
  // (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t c);
};
103
104
105uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
106  DCHECK(c > Latin1::kMaxChar);
107  switch (c) {
108    // This are equivalent characters in unicode.
109    case 0x39c:
110    case 0x3bc:
111      return 0xb5;
112    // This is an uppercase of a Latin-1 character
113    // outside of Latin-1.
114    case 0x178:
115      return 0xff;
116  }
117  return 0;
118}
119
120
121}  // namespace unibrow
122
123#endif  // V8_UNICODE_DECODER_H_
124