13ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch// Copyright 2011 the V8 project authors. All rights reserved.
2a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// Redistribution and use in source and binary forms, with or without
3a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// modification, are permitted provided that the following conditions are
4a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// met:
5a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//
6a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//     * Redistributions of source code must retain the above copyright
7a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//       notice, this list of conditions and the following disclaimer.
8a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//     * Redistributions in binary form must reproduce the above
9a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//       copyright notice, this list of conditions and the following
10a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//       disclaimer in the documentation and/or other materials provided
11a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//       with the distribution.
12a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//     * Neither the name of Google Inc. nor the names of its
13a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//       contributors may be used to endorse or promote products derived
14a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//       from this software without specific prior written permission.
15a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block//
16a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
28a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#ifndef V8_UNICODE_H_
29a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#define V8_UNICODE_H_
30a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
31a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#include <sys/types.h>
32a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
33a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block/**
34a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block * \file
35a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block * Definitions and convenience functions for working with unicode.
36a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block */
37a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
38a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocknamespace unibrow {
39a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
// Unsigned integer type this header uses for character / code point values.
typedef unsigned int uchar;
// Unsigned 8-bit-style unit this header uses for raw encoded data buffers.
typedef unsigned char byte;
42a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
/**
 * Upper bound on how many characters can result from converting the
 * case of one single character.
 */
const int kMaxMappingSize = 4;
48a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
49a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int size = 256>
50a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockclass Predicate {
51a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block public:
52a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline Predicate() { }
53a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline bool get(uchar c);
54a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block private:
55a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  friend class Test;
56a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  bool CalculateValue(uchar c);
57a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  struct CacheEntry {
58a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    inline CacheEntry() : code_point_(0), value_(0) { }
59a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    inline CacheEntry(uchar code_point, bool value)
60a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      : code_point_(code_point),
61a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block        value_(value) { }
62a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    uchar code_point_ : 21;
63a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    bool value_ : 1;
64a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  };
65a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kSize = size;
66a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMask = kSize - 1;
67a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  CacheEntry entries_[kSize];
68a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
69a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
70a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// A cache used in case conversion.  It caches the value for characters
71a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// that either have no mapping or map to a single character independent
72a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// of context.  Characters that map to more than one character or that
73a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// map differently depending on context are always looked up.
74a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class T, int size = 256>
75a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockclass Mapping {
76a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block public:
77a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline Mapping() { }
78a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline int get(uchar c, uchar n, uchar* result);
79a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block private:
80a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  friend class Test;
81a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  int CalculateValue(uchar c, uchar n, uchar* result);
82a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  struct CacheEntry {
83a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    inline CacheEntry(uchar code_point, signed offset)
85a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      : code_point_(code_point),
86a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block        offset_(offset) { }
87a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    uchar code_point_;
88a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    signed offset_;
89a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    static const int kNoChar = (1 << 21) - 1;
90a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  };
91a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kSize = size;
92a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMask = kSize - 1;
93a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  CacheEntry entries_[kSize];
94a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
95a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
96a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockclass UnicodeData {
97a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block private:
98a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  friend class Test;
99a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static int GetByteCount();
10044f0eee88ff00398ff7f715fab053374d808c90dSteve Block  static const uchar kMaxCodePoint;
101a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
102a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
1033ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch// --- U t f   8   a n d   16 ---
104a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
// A small value type pairing a data handle (typically a pointer) with a
// length.  The buffer does not manage the lifetime of whatever |data|
// refers to; it only stores the two values it was constructed with.
template <typename Data>
class Buffer {
 public:
  // Wraps |data| together with its |length|.
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // Creates an empty buffer: data is initialized from 0 and length is 0.
  inline Buffer() : data_(0), length_(0) { }

  // Accessors are const so they can be used on const buffers and through
  // const references.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};
116a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
1173ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch
1183ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdochclass Utf16 {
1193ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch public:
1203ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static inline bool IsLeadSurrogate(int code) {
1213ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    if (code == kNoPreviousCharacter) return false;
1223ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    return (code & 0xfc00) == 0xd800;
1233ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  }
1243ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static inline bool IsTrailSurrogate(int code) {
1253ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    if (code == kNoPreviousCharacter) return false;
1263ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    return (code & 0xfc00) == 0xdc00;
1273ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  }
1283ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch
1293ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
1303ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
1313ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  }
1323ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static const int kNoPreviousCharacter = -1;
1333ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static const uchar kMaxNonSurrogateCharCode = 0xffff;
1343ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
1353ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // of UTF-8 data.  The special case where the unit is a surrogate
1363ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // trail produces 1 byte net, because the encoding of the pair is
1373ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
1383ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // can be reclaimed.
1393ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
1403ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
1413ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // The illegality stems from the surrogate not being part of a pair.
1423ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static const int kUtf8BytesToCodeASurrogate = 3;
1433ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static inline uchar LeadSurrogate(int char_code) {
1443ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
1453ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  }
1463ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static inline uchar TrailSurrogate(int char_code) {
1473ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch    return 0xdc00 + (char_code & 0x3ff);
1483ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  }
1493ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch};
1503ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch
1513ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch
152a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockclass Utf8 {
153a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block public:
1543ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static inline uchar Length(uchar chr, int previous);
1553ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static inline unsigned Encode(
1563ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch      char* out, uchar c, int previous);
157a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      unsigned capacity, unsigned* chars_read, unsigned* offset);
1590d5e116f6aee03185f237311a943491bb079a768Kristian Monsen  static uchar CalculateValue(const byte* str,
1600d5e116f6aee03185f237311a943491bb079a768Kristian Monsen                              unsigned length,
1610d5e116f6aee03185f237311a943491bb079a768Kristian Monsen                              unsigned* cursor);
162a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const uchar kBadChar = 0xFFFD;
163a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const unsigned kMaxEncodedSize   = 4;
164a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const unsigned kMaxOneByteChar   = 0x7f;
165a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const unsigned kMaxTwoByteChar   = 0x7ff;
166a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const unsigned kMaxThreeByteChar = 0xffff;
167a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const unsigned kMaxFourByteChar  = 0x1fffff;
168a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
1693ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
1703ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  // that match are coded as a 4 byte UTF-8 sequence.
1713ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static const unsigned kBytesSavedByCombiningSurrogates = 2;
1723ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  static const unsigned kSizeOfUnmatchedSurrogate = 3;
1733ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch
174a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block private:
175a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  template <unsigned s> friend class Utf8InputBuffer;
176a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  friend class Test;
177a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static inline uchar ValueOf(const byte* str,
178a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                              unsigned length,
179a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                              unsigned* cursor);
180a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
181a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
182a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// --- C h a r a c t e r   S t r e a m ---
183a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
184a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockclass CharacterStream {
185a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block public:
186a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline uchar GetNext();
187a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline bool has_more() { return remaining_ != 0; }
188a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // Note that default implementation is not efficient.
189a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  virtual void Seek(unsigned);
190a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  unsigned Length();
1913ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch  unsigned Utf16Length();
192a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  virtual ~CharacterStream() { }
193a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
194a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      unsigned& offset);
195a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
196a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      unsigned capacity, unsigned& offset);
197a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
198a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block      unsigned capacity, unsigned& offset);
199a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
200a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  virtual void Rewind() = 0;
2013ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch
202a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block protected:
203a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  virtual void FillBuffer() = 0;
204a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // The number of characters left in the current buffer
205a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  unsigned remaining_;
206a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // The current offset within the buffer
207a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  unsigned cursor_;
208a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // The buffer containing the decoded characters.
209a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  const byte* buffer_;
210a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
211a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
212a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// --- I n p u t   B u f f e r ---
213a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
214a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block/**
215a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block * Provides efficient access to encoded characters in strings.  It
216a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block * does so by reading characters one block at a time, rather than one
217a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block * character at a time, which gives string implementations an
218a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block * opportunity to optimize the decoding.
219a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block */
220a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <class Reader, class Input = Reader*, unsigned kSize = 256>
221a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockclass InputBuffer : public CharacterStream {
222a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block public:
223a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  virtual void Rewind();
224a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline void Reset(Input input);
225a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  void Seek(unsigned position);
226a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline void Reset(unsigned position, Input input);
227a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block protected:
228a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  InputBuffer() { }
229a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  explicit InputBuffer(Input input) { Reset(input); }
230a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  virtual void FillBuffer();
231a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
232a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // A custom offset that can be used by the string implementation to
233a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // mark progress within the encoded string.
234a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  unsigned offset_;
235a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // The input string
236a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  Input input_;
237a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // To avoid heap allocation, we keep an internal buffer to which
238a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // the encoded string can write its characters.  The string
239a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // implementation is free to decide whether it wants to use this
240a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  // buffer or not.
241a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  byte util_buffer_[kSize];
242a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
243a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
244a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block// --- U t f 8   I n p u t   B u f f e r ---
245a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
246a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blocktemplate <unsigned s = 256>
247a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockclass Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
248a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block public:
249a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline Utf8InputBuffer() { }
250a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline Utf8InputBuffer(const char* data, unsigned length);
251a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  inline void Reset(const char* data, unsigned length) {
252a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
253a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block        Buffer<const char*>(data, length));
254a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  }
255a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
256a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
257bb769b257e753aafcbd96767abb2abc645eaa20cBen Murdoch
258a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct Uppercase {
259a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
260a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
261a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct Lowercase {
262a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
263a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
264a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct Letter {
265a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
266a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
267a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct Space {
268a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
269a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
270a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct Number {
271a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
272a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
273a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct WhiteSpace {
274a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
275a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
276a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct LineTerminator {
277a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
278a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
279a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct CombiningMark {
280a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
281a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
282a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct ConnectorPunctuation {
283a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static bool Is(uchar c);
284a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
285a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct ToLowercase {
286a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMaxWidth = 3;
287a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static int Convert(uchar c,
288a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar n,
289a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar* result,
290a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     bool* allow_caching_ptr);
291a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
292a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct ToUppercase {
293a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMaxWidth = 3;
294a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static int Convert(uchar c,
295a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar n,
296a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar* result,
297a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     bool* allow_caching_ptr);
298a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
299a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct Ecma262Canonicalize {
300a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMaxWidth = 1;
301a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static int Convert(uchar c,
302a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar n,
303a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar* result,
304a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     bool* allow_caching_ptr);
305a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
306a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct Ecma262UnCanonicalize {
307a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMaxWidth = 4;
308a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static int Convert(uchar c,
309a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar n,
310a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar* result,
311a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     bool* allow_caching_ptr);
312a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
313a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Blockstruct CanonicalizationRange {
314a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static const int kMaxWidth = 1;
315a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block  static int Convert(uchar c,
316a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar n,
317a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     uchar* result,
318a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block                     bool* allow_caching_ptr);
319a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block};
320a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
321a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block}  // namespace unibrow
322a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block
323a7e24c173cf37484693b9abb38e494fa7bd7baebSteve Block#endif  // V8_UNICODE_H_
324