1/*
2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "TextEncodingRegistry.h"
29
30#include "TextCodecLatin1.h"
31#include "TextCodecUserDefined.h"
32#include "TextCodecUTF16.h"
33#include "TextCodecUTF8.h"
34#include "TextEncoding.h"
35#include <wtf/ASCIICType.h>
36#include <wtf/HashMap.h>
37#include <wtf/HashSet.h>
38#include <wtf/StdLibExtras.h>
39#include <wtf/StringExtras.h>
40#include <wtf/Threading.h>
41
42#if USE(ICU_UNICODE)
43#include "TextCodecICU.h"
44#endif
45#if PLATFORM(MAC)
46#include "TextCodecMac.h"
47#endif
48#if PLATFORM(QT)
49#include "qt/TextCodecQt.h"
50#endif
51#if USE(GLIB_UNICODE)
52#include "gtk/TextCodecGtk.h"
53#endif
54#if USE(BREWMP_UNICODE)
55#include "brew/TextCodecBrew.h"
56#endif
57#if OS(WINCE) && !PLATFORM(QT)
58#include "TextCodecWinCE.h"
59#endif
60
61#include <wtf/CurrentTime.h>
62#include <wtf/text/CString.h>
63
64using namespace WTF;
65
66namespace WebCore {
67
// Longest encoding name or alias (in chars, not counting the terminating NUL)
// that the registry handles; sizes the conversion buffer used for UChar lookups
// and is asserted against when aliases are registered.
const size_t maxEncodingNameLength = 63;
69
70// Hash for all-ASCII strings that does case folding.
71struct TextEncodingNameHash {
72    static bool equal(const char* s1, const char* s2)
73    {
74        char c1;
75        char c2;
76        do {
77            c1 = *s1++;
78            c2 = *s2++;
79            if (toASCIILower(c1) != toASCIILower(c2))
80                return false;
81        } while (c1 && c2);
82        return !c1 && !c2;
83    }
84
85    // This algorithm is the one-at-a-time hash from:
86    // http://burtleburtle.net/bob/hash/hashfaq.html
87    // http://burtleburtle.net/bob/hash/doobs.html
88    static unsigned hash(const char* s)
89    {
90        unsigned h = WTF::stringHashingStartValue;
91        for (;;) {
92            char c = *s++;
93            if (!c) {
94                h += (h << 3);
95                h ^= (h >> 11);
96                h += (h << 15);
97                return h;
98            }
99            h += toASCIILower(c);
100            h += (h << 10);
101            h ^= (h >> 6);
102        }
103    }
104
105    static const bool safeToCompareToEmptyOrDeleted = false;
106};
107
108struct TextCodecFactory {
109    NewTextCodecFunction function;
110    const void* additionalData;
111    TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
112};
113
114typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
115typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
116
117static Mutex& encodingRegistryMutex()
118{
119    // We don't have to use AtomicallyInitializedStatic here because
120    // this function is called on the main thread for any page before
121    // it is used in worker threads.
122    DEFINE_STATIC_LOCAL(Mutex, mutex, ());
123    return mutex;
124}
125
126static TextEncodingNameMap* textEncodingNameMap;
127static TextCodecMap* textCodecMap;
128static bool didExtendTextCodecMaps;
129static HashSet<const char*>* japaneseEncodings;
130static HashSet<const char*>* nonBackslashEncodings;
131
132static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
133
134#if ERROR_DISABLED
135
136static inline void checkExistingName(const char*, const char*) { }
137
138#else
139
140static void checkExistingName(const char* alias, const char* atomicName)
141{
142    const char* oldAtomicName = textEncodingNameMap->get(alias);
143    if (!oldAtomicName)
144        return;
145    if (oldAtomicName == atomicName)
146        return;
147    // Keep the warning silent about one case where we know this will happen.
148    if (strcmp(alias, "ISO-8859-8-I") == 0
149            && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
150            && strcasecmp(atomicName, "iso-8859-8") == 0)
151        return;
152    LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
153}
154
155#endif
156
// Returns true for aliases that must not be registered even though a back-end
// reports them.
static bool isUndesiredAlias(const char* alias)
{
    // Some back-ends expose names carrying version/locale parameters (such as
    // "ISO_2022,locale=ja,version=0" in ICU); any comma marks such a name.
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this
    // name - and having it caused a compatibility problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
170
171static void addToTextEncodingNameMap(const char* alias, const char* name)
172{
173    ASSERT(strlen(alias) <= maxEncodingNameLength);
174    if (isUndesiredAlias(alias))
175        return;
176    const char* atomicName = textEncodingNameMap->get(name);
177    ASSERT(strcmp(alias, name) == 0 || atomicName);
178    if (!atomicName)
179        atomicName = name;
180    checkExistingName(alias, atomicName);
181    textEncodingNameMap->add(alias, atomicName);
182}
183
184static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
185{
186    const char* atomicName = textEncodingNameMap->get(name);
187    ASSERT(atomicName);
188    textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
189}
190
191static void pruneBlacklistedCodecs()
192{
193    for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
194        const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
195        if (!atomicName)
196            continue;
197
198        Vector<const char*> names;
199        TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
200        TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
201        for (; it != end; ++it) {
202            if (it->second == atomicName)
203                names.append(it->first);
204        }
205
206        size_t length = names.size();
207        for (size_t j = 0; j < length; ++j)
208            textEncodingNameMap->remove(names[j]);
209
210        textCodecMap->remove(atomicName);
211    }
212}
213
214static void buildBaseTextCodecMaps()
215{
216    ASSERT(isMainThread());
217    ASSERT(!textCodecMap);
218    ASSERT(!textEncodingNameMap);
219
220    textCodecMap = new TextCodecMap;
221    textEncodingNameMap = new TextEncodingNameMap;
222
223    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
224    TextCodecLatin1::registerCodecs(addToTextCodecMap);
225
226    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
227    TextCodecUTF8::registerCodecs(addToTextCodecMap);
228
229    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
230    TextCodecUTF16::registerCodecs(addToTextCodecMap);
231
232    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
233    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
234
235#if USE(GLIB_UNICODE)
236    // FIXME: This is not needed. The code above covers all the base codecs.
237    TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap);
238    TextCodecGtk::registerBaseCodecs(addToTextCodecMap);
239#endif
240}
241
242static void addEncodingName(HashSet<const char*>* set, const char* name)
243{
244    // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
245    const char* atomicName = textEncodingNameMap->get(name);
246    if (atomicName)
247        set->add(atomicName);
248}
249
250static void buildQuirksSets()
251{
252    // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
253    // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
254
255    ASSERT(!japaneseEncodings);
256    ASSERT(!nonBackslashEncodings);
257
258    japaneseEncodings = new HashSet<const char*>;
259    addEncodingName(japaneseEncodings, "EUC-JP");
260    addEncodingName(japaneseEncodings, "ISO-2022-JP");
261    addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
262    addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
263    addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
264    addEncodingName(japaneseEncodings, "JIS_C6226-1978");
265    addEncodingName(japaneseEncodings, "JIS_X0201");
266    addEncodingName(japaneseEncodings, "JIS_X0208-1983");
267    addEncodingName(japaneseEncodings, "JIS_X0208-1990");
268    addEncodingName(japaneseEncodings, "JIS_X0212-1990");
269    addEncodingName(japaneseEncodings, "Shift_JIS");
270    addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
271    addEncodingName(japaneseEncodings, "cp932");
272    addEncodingName(japaneseEncodings, "x-mac-japanese");
273
274    nonBackslashEncodings = new HashSet<const char*>;
275    // The text encodings below treat backslash as a currency symbol for IE compatibility.
276    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
277    addEncodingName(nonBackslashEncodings, "x-mac-japanese");
278    addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
279    addEncodingName(nonBackslashEncodings, "EUC-JP");
280    // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
281    addEncodingName(nonBackslashEncodings, "Shift_JIS");
282    addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
283}
284
285bool isJapaneseEncoding(const char* canonicalEncodingName)
286{
287    return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
288}
289
290bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
291{
292    return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
293}
294
295static void extendTextCodecMaps()
296{
297#if USE(ICU_UNICODE)
298    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
299    TextCodecICU::registerCodecs(addToTextCodecMap);
300#endif
301
302#if USE(QT4_UNICODE)
303    TextCodecQt::registerEncodingNames(addToTextEncodingNameMap);
304    TextCodecQt::registerCodecs(addToTextCodecMap);
305#endif
306
307#if PLATFORM(MAC)
308    TextCodecMac::registerEncodingNames(addToTextEncodingNameMap);
309    TextCodecMac::registerCodecs(addToTextCodecMap);
310#endif
311
312#if USE(GLIB_UNICODE)
313    TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap);
314    TextCodecGtk::registerExtendedCodecs(addToTextCodecMap);
315#endif
316
317#if OS(WINCE) && !PLATFORM(QT)
318    TextCodecWinCE::registerExtendedEncodingNames(addToTextEncodingNameMap);
319    TextCodecWinCE::registerExtendedCodecs(addToTextCodecMap);
320#endif
321
322    pruneBlacklistedCodecs();
323    buildQuirksSets();
324}
325
326PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
327{
328    MutexLocker lock(encodingRegistryMutex());
329
330    ASSERT(textCodecMap);
331    TextCodecFactory factory = textCodecMap->get(encoding.name());
332    ASSERT(factory.function);
333    return factory.function(encoding, factory.additionalData);
334}
335
336const char* atomicCanonicalTextEncodingName(const char* name)
337{
338    if (!name || !name[0])
339        return 0;
340    if (!textEncodingNameMap)
341        buildBaseTextCodecMaps();
342
343    MutexLocker lock(encodingRegistryMutex());
344
345    if (const char* atomicName = textEncodingNameMap->get(name))
346        return atomicName;
347    if (didExtendTextCodecMaps)
348        return 0;
349    extendTextCodecMaps();
350    didExtendTextCodecMaps = true;
351    return textEncodingNameMap->get(name);
352}
353
354const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length)
355{
356    char buffer[maxEncodingNameLength + 1];
357    size_t j = 0;
358    for (size_t i = 0; i < length; ++i) {
359        UChar c = characters[i];
360        if (j == maxEncodingNameLength)
361            return 0;
362        buffer[j++] = c;
363    }
364    buffer[j] = 0;
365    return atomicCanonicalTextEncodingName(buffer);
366}
367
368bool noExtendedTextEncodingNameUsed()
369{
370    // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
371    return !didExtendTextCodecMaps;
372}
373
374#ifndef NDEBUG
375void dumpTextEncodingNameMap()
376{
377    unsigned size = textEncodingNameMap->size();
378    fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size);
379
380    MutexLocker lock(encodingRegistryMutex());
381
382    TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
383    TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
384    for (; it != end; ++it)
385        fprintf(stderr, "'%s' => '%s'\n", it->first, it->second);
386}
387#endif
388
389} // namespace WebCore
390