1/*
2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "wtf/text/TextEncodingRegistry.h"
29
30#include "wtf/ASCIICType.h"
31#include "wtf/CurrentTime.h"
32#include "wtf/HashMap.h"
33#include "wtf/HashSet.h"
34#include "wtf/MainThread.h"
35#include "wtf/StdLibExtras.h"
36#include "wtf/StringExtras.h"
37#include "wtf/ThreadingPrimitives.h"
38#include "wtf/text/CString.h"
39#include "wtf/text/TextCodecICU.h"
40#include "wtf/text/TextCodecLatin1.h"
41#include "wtf/text/TextCodecReplacement.h"
42#include "wtf/text/TextCodecUTF16.h"
43#include "wtf/text/TextCodecUTF8.h"
44#include "wtf/text/TextCodecUserDefined.h"
45#include "wtf/text/TextEncoding.h"
46
47namespace WTF {
48
// Maximum length, in characters and excluding the terminating NUL, of an
// encoding name or alias. It sizes the stack buffer used when looking up names
// supplied as character arrays, and registration asserts that aliases fit.
const size_t maxEncodingNameLength = 63;
50
// Hash for all-ASCII strings that does case folding.
// Used as the hash functor of TextEncodingNameMap so that aliases are looked
// up without regard to ASCII case.
struct TextEncodingNameHash {
    // Returns true when the two NUL-terminated strings are equal after ASCII
    // lower-casing every character.
    static bool equal(const char* s1, const char* s2)
    {
        char c1;
        char c2;
        do {
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
            // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
            // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
            c1 = toASCIILower(*s1++);
            c2 = toASCIILower(*s2++);
            if (c1 != c2)
                return false;
#else
            c1 = *s1++;
            c2 = *s2++;
            if (toASCIILower(c1) != toASCIILower(c2))
                return false;
#endif
        } while (c1 && c2);
        // The loop only exits normally once a terminator has been consumed;
        // the strings match when both ended at the same position.
        return !c1 && !c2;
    }

    // This algorithm is the one-at-a-time hash from:
    // http://burtleburtle.net/bob/hash/hashfaq.html
    // http://burtleburtle.net/bob/hash/doobs.html
    // Each character is lower-cased before being mixed in, so strings that
    // equal() treats as the same produce the same hash.
    static unsigned hash(const char* s)
    {
        unsigned h = WTF::stringHashingStartValue;
        for (;;) {
            char c = *s++;
            if (!c) {
                // End of string: apply the final mixing steps and return.
                h += (h << 3);
                h ^= (h >> 11);
                h += (h << 15);
                return h;
            }
            h += toASCIILower(c);
            h += (h << 10);
            h ^= (h >> 6);
        }
    }

    // NOTE(review): presumably tells the hash table never to pass its
    // empty/deleted sentinel keys to equal(), which dereferences both
    // pointers - confirm against the HashTable implementation.
    static const bool safeToCompareToEmptyOrDeleted = false;
};
97
98struct TextCodecFactory {
99    NewTextCodecFunction function;
100    const void* additionalData;
101    TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
102};
103
// Maps every registered alias to the pointer used as its canonical ("atomic")
// name. Keys are hashed and compared with TextEncodingNameHash, i.e. ignoring
// ASCII case.
typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Maps an atomic name to the factory that creates its codec.
// NOTE(review): uses the default hash for const char*, presumably pointer
// identity, which is why lookups go through atomic names - confirm.
typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
106
// Returns the mutex that guards the encoding name and codec maps. The mutex is
// a function-local static created on first use.
static Mutex& encodingRegistryMutex()
{
    // We don't have to use AtomicallyInitializedStatic here because
    // this function is called on the main thread for any page before
    // it is used in worker threads.
    DEFINE_STATIC_LOCAL(Mutex, mutex, ());
    return mutex;
}
115
// Alias -> atomic name registry; allocated by buildBaseTextCodecMaps().
static TextEncodingNameMap* textEncodingNameMap;
// Atomic name -> codec factory registry; allocated by buildBaseTextCodecMaps().
static TextCodecMap* textCodecMap;
// Set once extendTextCodecMaps() has run, so the extended registration is
// attempted at most once.
static bool didExtendTextCodecMaps;

// Encodings that pruneBlacklistedCodecs() removes from both maps, together
// with all of their aliases. Each entry holds up to five characters plus NUL.
static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
121
122#if ERROR_DISABLED
123
124static inline void checkExistingName(const char*, const char*) { }
125
126#else
127
128static void checkExistingName(const char* alias, const char* atomicName)
129{
130    const char* oldAtomicName = textEncodingNameMap->get(alias);
131    if (!oldAtomicName)
132        return;
133    if (oldAtomicName == atomicName)
134        return;
135    // Keep the warning silent about one case where we know this will happen.
136    if (strcmp(alias, "ISO-8859-8-I") == 0
137            && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
138            && strcasecmp(atomicName, "iso-8859-8") == 0)
139        return;
140    WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
141}
142
143#endif
144
// Returns true for aliases that must not be entered into the name map.
static bool isUndesiredAlias(const char* alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    // Such aliases are recognized by the comma they contain.
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
158
159static void addToTextEncodingNameMap(const char* alias, const char* name)
160{
161    ASSERT(strlen(alias) <= maxEncodingNameLength);
162    if (isUndesiredAlias(alias))
163        return;
164    const char* atomicName = textEncodingNameMap->get(name);
165    ASSERT(strcmp(alias, name) == 0 || atomicName);
166    if (!atomicName)
167        atomicName = name;
168    checkExistingName(alias, atomicName);
169    textEncodingNameMap->add(alias, atomicName);
170}
171
172static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
173{
174    const char* atomicName = textEncodingNameMap->get(name);
175    ASSERT(atomicName);
176    textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
177}
178
179static void pruneBlacklistedCodecs()
180{
181    for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
182        const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
183        if (!atomicName)
184            continue;
185
186        Vector<const char*> names;
187        TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
188        TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
189        for (; it != end; ++it) {
190            if (it->value == atomicName)
191                names.append(it->key);
192        }
193
194        textEncodingNameMap->removeAll(names);
195
196        textCodecMap->remove(atomicName);
197    }
198}
199
// Allocates both registry maps and fills them with the built-in codecs
// (Latin-1, UTF-8, UTF-16 and user-defined). Asserted to run on the main
// thread and only while neither map exists yet.
static void buildBaseTextCodecMaps()
{
    ASSERT(isMainThread());
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // For each codec the names are registered before the factory, because
    // addToTextCodecMap() resolves its name through the name map.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF8::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
221
// Returns true when |alias| is non-null and spells "replacement", ignoring
// case.
bool isReplacementEncoding(const char* alias)
{
    if (!alias)
        return false;
    return strcasecmp(alias, "replacement") == 0;
}
226
227bool isReplacementEncoding(const String& alias)
228{
229    return alias == "replacement";
230}
231
// Adds the replacement codec and the ICU-backed codecs to the maps created by
// buildBaseTextCodecMaps(), then removes the blacklisted encodings again.
static void extendTextCodecMaps()
{
    // For each codec the names are registered before the factory, because
    // addToTextCodecMap() resolves its name through the name map.
    TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecReplacement::registerCodecs(addToTextCodecMap);

    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    // Pruning runs last so that blacklisted names registered above are removed.
    pruneBlacklistedCodecs();
}
242
243PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
244{
245    MutexLocker lock(encodingRegistryMutex());
246
247    ASSERT(textCodecMap);
248    TextCodecFactory factory = textCodecMap->get(encoding.name());
249    ASSERT(factory.function);
250    return factory.function(encoding, factory.additionalData);
251}
252
253const char* atomicCanonicalTextEncodingName(const char* name)
254{
255    if (!name || !name[0])
256        return 0;
257    if (!textEncodingNameMap)
258        buildBaseTextCodecMaps();
259
260    MutexLocker lock(encodingRegistryMutex());
261
262    if (const char* atomicName = textEncodingNameMap->get(name))
263        return atomicName;
264    if (didExtendTextCodecMaps)
265        return 0;
266    extendTextCodecMaps();
267    didExtendTextCodecMaps = true;
268    return textEncodingNameMap->get(name);
269}
270
271template <typename CharacterType>
272const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
273{
274    char buffer[maxEncodingNameLength + 1];
275    size_t j = 0;
276    for (size_t i = 0; i < length; ++i) {
277        CharacterType c = characters[i];
278        if (j == maxEncodingNameLength)
279            return 0;
280        buffer[j++] = c;
281    }
282    buffer[j] = 0;
283    return atomicCanonicalTextEncodingName(buffer);
284}
285
286const char* atomicCanonicalTextEncodingName(const String& alias)
287{
288    if (!alias.length())
289        return 0;
290
291    if (alias.is8Bit())
292        return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
293
294    return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
295}
296
297bool noExtendedTextEncodingNameUsed()
298{
299    // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
300    return !didExtendTextCodecMaps;
301}
302
303#ifndef NDEBUG
304void dumpTextEncodingNameMap()
305{
306    unsigned size = textEncodingNameMap->size();
307    fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
308
309    MutexLocker lock(encodingRegistryMutex());
310
311    TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
312    TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
313    for (; it != end; ++it)
314        fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
315}
316#endif
317
318} // namespace WTF
319