// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef V8_UNICODE_H_
#define V8_UNICODE_H_

#include <sys/types.h>

/**
 * \file
 * Definitions and convenience functions for working with unicode.
 */

namespace unibrow {

typedef unsigned int uchar;
typedef unsigned char byte;

/**
 * The max length of the result of converting the case of a single
 * character.
 */
const int kMaxMappingSize = 4;

// A fixed-size table of cached boolean answers keyed by code point.
// Each entry packs a 21-bit code point together with a one-bit value.
// get() and CalculateValue() are only declared here; their definitions
// live outside this header.
// NOTE(review): kMask = kSize - 1 presumably serves as an index mask, which
// would require `size` to be a power of two -- confirm against get().
template <class T, int size = 256>
class Predicate {
 public:
  inline Predicate() { }
  inline bool get(uchar c);
 private:
  friend class Test;
  bool CalculateValue(uchar c);
  struct CacheEntry {
    // An empty entry records code point 0 with the value false.
    inline CacheEntry() : code_point_(0), value_(false) { }
    inline CacheEntry(uchar code_point, bool value)
      : code_point_(code_point),
        value_(value) { }
    uchar code_point_ : 21;
    bool value_ : 1;
  };
  static const int kSize = size;
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// A cache used in case conversion.  It caches the value for characters
// that either have no mapping or map to a single character independent
// of context.  Characters that map to more than one character or that
// map differently depending on context are always looked up.
template <class T, int size = 256>
class Mapping {
 public:
  inline Mapping() { }
  inline int get(uchar c, uchar n, uchar* result);
 private:
  friend class Test;
  int CalculateValue(uchar c, uchar n, uchar* result);
  struct CacheEntry {
    // An empty entry is tagged with kNoChar and carries a zero offset.
    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
    inline CacheEntry(uchar code_point, signed offset)
      : code_point_(code_point),
        offset_(offset) { }
    uchar code_point_;
    signed offset_;
    // All 21 low bits set; used as the code point of an empty entry.
    static const int kNoChar = (1 << 21) - 1;
  };
  static const int kSize = size;
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// Private bookkeeping about the unicode tables; only reachable through the
// Test friend.  Both members are defined outside this header.
class UnicodeData {
 private:
  friend class Test;
  static int GetByteCount();
  static const uchar kMaxCodePoint;
};

// --- U t f 8 a n d 16 ---

// A plain (data, length) pair.  The buffer does not manage the lifetime of
// whatever `data` refers to; it only stores the two values it is given.
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  inline Buffer() : data_(0), length_(0) { }
  // Accessors are const so that they can be used on const buffers.
  Data data() const { return data_; }
  unsigned length() const { return length_; }
 private:
  Data data_;
  unsigned length_;
};


class Utf16 {
 public:
  // Returns true iff `code` lies in the lead (high) surrogate range
  // 0xd800..0xdbff.  All bits above the low ten are compared, so values
  // beyond 0xffff (e.g. 0x1d800) are not mistaken for surrogates.
  static inline bool IsLeadSurrogate(int code) {
    if (code == kNoPreviousCharacter) return false;
    return (code & ~0x3ff) == 0xd800;
  }
  // Returns true iff `code` lies in the trail (low) surrogate range
  // 0xdc00..0xdfff.  See IsLeadSurrogate for the treatment of high bits.
  static inline bool IsTrailSurrogate(int code) {
    if (code == kNoPreviousCharacter) return false;
    return (code & ~0x3ff) == 0xdc00;
  }

  // Combines the low ten bits of `lead` and `trail` into a code point at or
  // above 0x10000.  The arguments are not checked to be surrogates.
  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
  }
  static const int kNoPreviousCharacter = -1;
  static const uchar kMaxNonSurrogateCharCode = 0xffff;
  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
  // of UTF-8 data.  The special case where the unit is a surrogate
  // trail produces 1 byte net, because the encoding of the pair is
  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
  // can be reclaimed.
  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
  // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
  // The illegality stems from the surrogate not being part of a pair.
  static const int kUtf8BytesToCodeASurrogate = 3;
  // Returns the lead surrogate for `char_code`: 0xd800 plus bits 10..19 of
  // (char_code - 0x10000).
  static inline uchar LeadSurrogate(int char_code) {
    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
  }
  // Returns the trail surrogate for `char_code`: 0xdc00 plus its low ten
  // bits.
  static inline uchar TrailSurrogate(int char_code) {
    return 0xdc00 + (char_code & 0x3ff);
  }
};


// UTF-8 helpers.  Apart from the constants, every member is only declared
// here and defined outside this header.
class Utf8 {
 public:
  static inline uchar Length(uchar chr, int previous);
  static inline unsigned Encode(
      char* out, uchar c, int previous);
  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
      unsigned capacity, unsigned* chars_read, unsigned* offset);
  static uchar CalculateValue(const byte* str,
                              unsigned length,
                              unsigned* cursor);
  static const uchar kBadChar = 0xFFFD;
  static const unsigned kMaxEncodedSize   = 4;
  static const unsigned kMaxOneByteChar   = 0x7f;
  static const unsigned kMaxTwoByteChar   = 0x7ff;
  static const unsigned kMaxThreeByteChar = 0xffff;
  static const unsigned kMaxFourByteChar  = 0x1fffff;

  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
  // that match are coded as a 4 byte UTF-8 sequence.
  static const unsigned kBytesSavedByCombiningSurrogates = 2;
  static const unsigned kSizeOfUnmatchedSurrogate = 3;

 private:
  template <unsigned s> friend class Utf8InputBuffer;
  friend class Test;
  static inline uchar ValueOf(const byte* str,
                              unsigned length,
                              unsigned* cursor);
};

// --- C h a r a c t e r S t r e a m ---

// Abstract base for streams that hand out characters from a buffer which
// subclasses refill through FillBuffer().
class CharacterStream {
 public:
  inline uchar GetNext();
  // True while remaining_ is non-zero.
  inline bool has_more() const { return remaining_ != 0; }
  // Note that default implementation is not efficient.
  virtual void Seek(unsigned);
  unsigned Length();
  unsigned Utf16Length();
  virtual ~CharacterStream() { }
  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
      unsigned& offset);
  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
  virtual void Rewind() = 0;

 protected:
  // Starts out empty so that has_more() is well defined on a stream that
  // has not been given any input yet.
  CharacterStream() : remaining_(0), cursor_(0), buffer_(0) { }
  virtual void FillBuffer() = 0;
  // The number of characters left in the current buffer
  unsigned remaining_;
  // The current offset within the buffer
  unsigned cursor_;
  // The buffer containing the decoded characters.
  const byte* buffer_;
};

// --- I n p u t B u f f e r ---

/**
 * Provides efficient access to encoded characters in strings.  It
 * does so by reading characters one block at a time, rather than one
 * character at a time, which gives string implementations an
 * opportunity to optimize the decoding.
 */
template <class Reader, class Input = Reader*, unsigned kSize = 256>
class InputBuffer : public CharacterStream {
 public:
  virtual void Rewind();
  inline void Reset(Input input);
  // Overrides CharacterStream::Seek.
  virtual void Seek(unsigned position);
  inline void Reset(unsigned position, Input input);
 protected:
  InputBuffer() : offset_(0) { }
  explicit InputBuffer(Input input) { Reset(input); }
  virtual void FillBuffer();

  // A custom offset that can be used by the string implementation to
  // mark progress within the encoded string.
  unsigned offset_;
  // The input string
  Input input_;
  // To avoid heap allocation, we keep an internal buffer to which
  // the encoded string can write its characters.  The string
  // implementation is free to decide whether it wants to use this
  // buffer or not.
  byte util_buffer_[kSize];
};

// --- U t f 8 I n p u t B u f f e r ---

// An InputBuffer that reads through Utf8 from a (const char*, length) pair.
template <unsigned s = 256>
class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
 public:
  inline Utf8InputBuffer() { }
  inline Utf8InputBuffer(const char* data, unsigned length);
  // Wraps the arguments in a Buffer and forwards to InputBuffer::Reset.
  inline void Reset(const char* data, unsigned length) {
    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
        Buffer<const char*>(data, length));
  }
};


// Character class predicates.  Each struct exposes one static Is(uchar)
// whose definition lives outside this header.
// NOTE(review): presumably these serve as the T parameter of Predicate --
// confirm against the definition of Predicate::CalculateValue.
struct Uppercase {
  static bool Is(uchar c);
};
struct Lowercase {
  static bool Is(uchar c);
};
struct Letter {
  static bool Is(uchar c);
};
struct Space {
  static bool Is(uchar c);
};
struct Number {
  static bool Is(uchar c);
};
struct WhiteSpace {
  static bool Is(uchar c);
};
struct LineTerminator {
  static bool Is(uchar c);
};
struct CombiningMark {
  static bool Is(uchar c);
};
struct ConnectorPunctuation {
  static bool Is(uchar c);
};

// Character conversions.  Each struct exposes a static Convert() with the
// same signature, defined outside this header, plus a kMaxWidth constant.
// NOTE(review): kMaxWidth presumably bounds the number of uchars Convert()
// writes to `result`, and these presumably serve as the T parameter of
// Mapping -- confirm against the definitions.
struct ToLowercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct ToUppercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct Ecma262Canonicalize {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct Ecma262UnCanonicalize {
  static const int kMaxWidth = 4;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct CanonicalizationRange {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};

}  // namespace unibrow

#endif  // V8_UNICODE_H_