/*
 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
 * Copyright (C) 2007-2009 Torch Mobile, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */ 27 28#include "config.h" 29#include "wtf/text/TextEncoding.h" 30 31#include "wtf/text/TextEncodingRegistry.h" 32#include <unicode/unorm.h> 33#include "wtf/OwnPtr.h" 34#include "wtf/StdLibExtras.h" 35#include "wtf/text/CString.h" 36#include "wtf/text/WTFString.h" 37 38namespace WTF { 39 40static const TextEncoding& UTF7Encoding() 41{ 42 static TextEncoding globalUTF7Encoding("UTF-7"); 43 return globalUTF7Encoding; 44} 45 46TextEncoding::TextEncoding(const char* name) 47 : m_name(atomicCanonicalTextEncodingName(name)) 48{ 49 // Aliases are valid, but not "replacement" itself. 50 if (m_name && isReplacementEncoding(name)) 51 m_name = 0; 52} 53 54TextEncoding::TextEncoding(const String& name) 55 : m_name(atomicCanonicalTextEncodingName(name)) 56{ 57 // Aliases are valid, but not "replacement" itself. 58 if (m_name && isReplacementEncoding(name)) 59 m_name = 0; 60} 61 62String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const 63{ 64 if (!m_name) 65 return String(); 66 67 return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError); 68} 69 70CString TextEncoding::encode(const String& string, UnencodableHandling handling) const 71{ 72 if (!m_name) 73 return CString(); 74 75 if (string.isEmpty()) 76 return ""; 77 78 OwnPtr<TextCodec> textCodec = newTextCodec(*this); 79 CString encodedString; 80 if (string.is8Bit()) 81 encodedString = textCodec->encode(string.characters8(), string.length(), handling); 82 else 83 encodedString = textCodec->encode(string.characters16(), string.length(), handling); 84 return encodedString; 85} 86 87CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const 88{ 89 if (!m_name) 90 return CString(); 91 92 if (string.isEmpty()) 93 return ""; 94 95 // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left 96 // unaffected by NFC. 
This is effectively the same as saying that all 97 // Latin-1 text is already normalized to NFC. 98 // Source: http://unicode.org/reports/tr15/ 99 if (string.is8Bit()) 100 return newTextCodec(*this)->encode(string.characters8(), string.length(), handling); 101 102 const UChar* source = string.characters16(); 103 size_t length = string.length(); 104 105 Vector<UChar> normalizedCharacters; 106 107 UErrorCode err = U_ZERO_ERROR; 108 if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) { 109 // First try using the length of the original string, since normalization to NFC rarely increases length. 110 normalizedCharacters.grow(length); 111 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); 112 if (err == U_BUFFER_OVERFLOW_ERROR) { 113 err = U_ZERO_ERROR; 114 normalizedCharacters.resize(normalizedLength); 115 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); 116 } 117 ASSERT(U_SUCCESS(err)); 118 119 source = normalizedCharacters.data(); 120 length = normalizedLength; 121 } 122 123 return newTextCodec(*this)->encode(source, length, handling); 124} 125 126bool TextEncoding::usesVisualOrdering() const 127{ 128 if (noExtendedTextEncodingNameUsed()) 129 return false; 130 131 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); 132 return m_name == a; 133} 134 135bool TextEncoding::isNonByteBasedEncoding() const 136{ 137 if (noExtendedTextEncodingNameUsed()) { 138 return *this == UTF16LittleEndianEncoding() 139 || *this == UTF16BigEndianEncoding(); 140 } 141 142 return *this == UTF16LittleEndianEncoding() 143 || *this == UTF16BigEndianEncoding() 144 || *this == UTF32BigEndianEncoding() 145 || *this == UTF32LittleEndianEncoding(); 146} 147 148bool TextEncoding::isUTF7Encoding() const 149{ 150 if (noExtendedTextEncodingNameUsed()) 151 return false; 152 153 return *this == UTF7Encoding(); 154} 155 156const 
TextEncoding& TextEncoding::closestByteBasedEquivalent() const 157{ 158 if (isNonByteBasedEncoding()) 159 return UTF8Encoding(); 160 return *this; 161} 162 163// HTML5 specifies that UTF-8 be used in form submission when a form is 164// is a part of a document in UTF-16 probably because UTF-16 is not a 165// byte-based encoding and can contain 0x00. By extension, the same 166// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, 167// but it's fraught with problems and we'd rather steer clear of it. 168const TextEncoding& TextEncoding::encodingForFormSubmission() const 169{ 170 if (isNonByteBasedEncoding() || isUTF7Encoding()) 171 return UTF8Encoding(); 172 return *this; 173} 174 175const TextEncoding& ASCIIEncoding() 176{ 177 static TextEncoding globalASCIIEncoding("ASCII"); 178 return globalASCIIEncoding; 179} 180 181const TextEncoding& Latin1Encoding() 182{ 183 static TextEncoding globalLatin1Encoding("latin1"); 184 return globalLatin1Encoding; 185} 186 187const TextEncoding& UTF16BigEndianEncoding() 188{ 189 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); 190 return globalUTF16BigEndianEncoding; 191} 192 193const TextEncoding& UTF16LittleEndianEncoding() 194{ 195 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); 196 return globalUTF16LittleEndianEncoding; 197} 198 199const TextEncoding& UTF32BigEndianEncoding() 200{ 201 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); 202 return globalUTF32BigEndianEncoding; 203} 204 205const TextEncoding& UTF32LittleEndianEncoding() 206{ 207 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); 208 return globalUTF32LittleEndianEncoding; 209} 210 211const TextEncoding& UTF8Encoding() 212{ 213 static TextEncoding globalUTF8Encoding("UTF-8"); 214 ASSERT(globalUTF8Encoding.isValid()); 215 return globalUTF8Encoding; 216} 217 218const TextEncoding& WindowsLatin1Encoding() 219{ 220 static TextEncoding globalWindowsLatin1Encoding("WinLatin1"); 221 return 
globalWindowsLatin1Encoding; 222} 223 224} // namespace WTF 225