1/*
2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "wtf/text/TextEncoding.h"
30
31#include "wtf/text/TextEncodingRegistry.h"
32#include <unicode/unorm.h>
33#include "wtf/OwnPtr.h"
34#include "wtf/StdLibExtras.h"
35#include "wtf/text/CString.h"
36#include "wtf/text/WTFString.h"
37
38namespace WTF {
39
40static const TextEncoding& UTF7Encoding()
41{
42    static TextEncoding globalUTF7Encoding("UTF-7");
43    return globalUTF7Encoding;
44}
45
46TextEncoding::TextEncoding(const char* name)
47    : m_name(atomicCanonicalTextEncodingName(name))
48{
49    // Aliases are valid, but not "replacement" itself.
50    if (m_name && isReplacementEncoding(name))
51        m_name = 0;
52}
53
54TextEncoding::TextEncoding(const String& name)
55    : m_name(atomicCanonicalTextEncodingName(name))
56{
57    // Aliases are valid, but not "replacement" itself.
58    if (m_name && isReplacementEncoding(name))
59        m_name = 0;
60}
61
62String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
63{
64    if (!m_name)
65        return String();
66
67    return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError);
68}
69
70CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
71{
72    if (!m_name)
73        return CString();
74
75    if (string.isEmpty())
76        return "";
77
78    OwnPtr<TextCodec> textCodec = newTextCodec(*this);
79    CString encodedString;
80    if (string.is8Bit())
81        encodedString = textCodec->encode(string.characters8(), string.length(), handling);
82    else
83        encodedString = textCodec->encode(string.characters16(), string.length(), handling);
84    return encodedString;
85}
86
87CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
88{
89    if (!m_name)
90        return CString();
91
92    if (string.isEmpty())
93        return "";
94
95    // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
96    // unaffected by NFC. This is effectively the same as saying that all
97    // Latin-1 text is already normalized to NFC.
98    // Source: http://unicode.org/reports/tr15/
99    if (string.is8Bit())
100        return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);
101
102    const UChar* source = string.characters16();
103    size_t length = string.length();
104
105    Vector<UChar> normalizedCharacters;
106
107    UErrorCode err = U_ZERO_ERROR;
108    if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
109        // First try using the length of the original string, since normalization to NFC rarely increases length.
110        normalizedCharacters.grow(length);
111        int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
112        if (err == U_BUFFER_OVERFLOW_ERROR) {
113            err = U_ZERO_ERROR;
114            normalizedCharacters.resize(normalizedLength);
115            normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
116        }
117        ASSERT(U_SUCCESS(err));
118
119        source = normalizedCharacters.data();
120        length = normalizedLength;
121    }
122
123    return newTextCodec(*this)->encode(source, length, handling);
124}
125
126bool TextEncoding::usesVisualOrdering() const
127{
128    if (noExtendedTextEncodingNameUsed())
129        return false;
130
131    static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
132    return m_name == a;
133}
134
135bool TextEncoding::isNonByteBasedEncoding() const
136{
137    if (noExtendedTextEncodingNameUsed()) {
138        return *this == UTF16LittleEndianEncoding()
139            || *this == UTF16BigEndianEncoding();
140    }
141
142    return *this == UTF16LittleEndianEncoding()
143        || *this == UTF16BigEndianEncoding()
144        || *this == UTF32BigEndianEncoding()
145        || *this == UTF32LittleEndianEncoding();
146}
147
148bool TextEncoding::isUTF7Encoding() const
149{
150    if (noExtendedTextEncodingNameUsed())
151        return false;
152
153    return *this == UTF7Encoding();
154}
155
156const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
157{
158    if (isNonByteBasedEncoding())
159        return UTF8Encoding();
160    return *this;
161}
162
163// HTML5 specifies that UTF-8 be used in form submission when a form is
164// is a part of a document in UTF-16 probably because UTF-16 is not a
165// byte-based encoding and can contain 0x00. By extension, the same
166// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
167// but it's fraught with problems and we'd rather steer clear of it.
168const TextEncoding& TextEncoding::encodingForFormSubmission() const
169{
170    if (isNonByteBasedEncoding() || isUTF7Encoding())
171        return UTF8Encoding();
172    return *this;
173}
174
175const TextEncoding& ASCIIEncoding()
176{
177    static TextEncoding globalASCIIEncoding("ASCII");
178    return globalASCIIEncoding;
179}
180
181const TextEncoding& Latin1Encoding()
182{
183    static TextEncoding globalLatin1Encoding("latin1");
184    return globalLatin1Encoding;
185}
186
187const TextEncoding& UTF16BigEndianEncoding()
188{
189    static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
190    return globalUTF16BigEndianEncoding;
191}
192
193const TextEncoding& UTF16LittleEndianEncoding()
194{
195    static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
196    return globalUTF16LittleEndianEncoding;
197}
198
199const TextEncoding& UTF32BigEndianEncoding()
200{
201    static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
202    return globalUTF32BigEndianEncoding;
203}
204
205const TextEncoding& UTF32LittleEndianEncoding()
206{
207    static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
208    return globalUTF32LittleEndianEncoding;
209}
210
211const TextEncoding& UTF8Encoding()
212{
213    static TextEncoding globalUTF8Encoding("UTF-8");
214    ASSERT(globalUTF8Encoding.isValid());
215    return globalUTF8Encoding;
216}
217
218const TextEncoding& WindowsLatin1Encoding()
219{
220    static TextEncoding globalWindowsLatin1Encoding("WinLatin1");
221    return globalWindowsLatin1Encoding;
222}
223
224} // namespace WTF
225