1/* 2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved. 3 * Copyright (C) 2007-2009 Torch Mobile, Inc. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include "config.h" 28#include "wtf/text/TextEncodingRegistry.h" 29 30#include "wtf/ASCIICType.h" 31#include "wtf/CurrentTime.h" 32#include "wtf/HashMap.h" 33#include "wtf/HashSet.h" 34#include "wtf/MainThread.h" 35#include "wtf/StdLibExtras.h" 36#include "wtf/StringExtras.h" 37#include "wtf/ThreadingPrimitives.h" 38#include "wtf/text/CString.h" 39#include "wtf/text/TextCodecICU.h" 40#include "wtf/text/TextCodecLatin1.h" 41#include "wtf/text/TextCodecReplacement.h" 42#include "wtf/text/TextCodecUTF16.h" 43#include "wtf/text/TextCodecUTF8.h" 44#include "wtf/text/TextCodecUserDefined.h" 45#include "wtf/text/TextEncoding.h" 46 47namespace WTF { 48 49const size_t maxEncodingNameLength = 63; 50 51// Hash for all-ASCII strings that does case folding. 52struct TextEncodingNameHash { 53 static bool equal(const char* s1, const char* s2) 54 { 55 char c1; 56 char c2; 57 do { 58#if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106 59 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released. 60 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only 61 c1 = toASCIILower(*s1++); 62 c2 = toASCIILower(*s2++); 63 if (c1 != c2) 64 return false; 65#else 66 c1 = *s1++; 67 c2 = *s2++; 68 if (toASCIILower(c1) != toASCIILower(c2)) 69 return false; 70#endif 71 } while (c1 && c2); 72 return !c1 && !c2; 73 } 74 75 // This algorithm is the one-at-a-time hash from: 76 // http://burtleburtle.net/bob/hash/hashfaq.html 77 // http://burtleburtle.net/bob/hash/doobs.html 78 static unsigned hash(const char* s) 79 { 80 unsigned h = WTF::stringHashingStartValue; 81 for (;;) { 82 char c = *s++; 83 if (!c) { 84 h += (h << 3); 85 h ^= (h >> 11); 86 h += (h << 15); 87 return h; 88 } 89 h += toASCIILower(c); 90 h += (h << 10); 91 h ^= (h >> 6); 92 } 93 } 94 95 static const bool safeToCompareToEmptyOrDeleted = false; 96}; 97 98struct TextCodecFactory { 99 NewTextCodecFunction function; 100 const void* additionalData; 101 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } 102}; 103 104typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap; 105typedef HashMap<const char*, TextCodecFactory> TextCodecMap; 106 107static Mutex& encodingRegistryMutex() 108{ 109 // We don't have to use AtomicallyInitializedStatic here because 110 // this function is called on the main thread for any page before 111 // it is used in worker threads. 112 DEFINE_STATIC_LOCAL(Mutex, mutex, ()); 113 return mutex; 114} 115 116static TextEncodingNameMap* textEncodingNameMap; 117static TextCodecMap* textCodecMap; 118static bool didExtendTextCodecMaps; 119 120static const char textEncodingNameBlacklist[][6] = { "UTF-7" }; 121 122#if ERROR_DISABLED 123 124static inline void checkExistingName(const char*, const char*) { } 125 126#else 127 128static void checkExistingName(const char* alias, const char* atomicName) 129{ 130 const char* oldAtomicName = textEncodingNameMap->get(alias); 131 if (!oldAtomicName) 132 return; 133 if (oldAtomicName == atomicName) 134 return; 135 // Keep the warning silent about one case where we know this will happen. 136 if (strcmp(alias, "ISO-8859-8-I") == 0 137 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 138 && strcasecmp(atomicName, "iso-8859-8") == 0) 139 return; 140 WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); 141} 142 143#endif 144 145static bool isUndesiredAlias(const char* alias) 146{ 147 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). 148 for (const char* p = alias; *p; ++p) { 149 if (*p == ',') 150 return true; 151 } 152 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility 153 // problem, see bug 43554. 154 if (0 == strcmp(alias, "8859_1")) 155 return true; 156 return false; 157} 158 159static void addToTextEncodingNameMap(const char* alias, const char* name) 160{ 161 ASSERT(strlen(alias) <= maxEncodingNameLength); 162 if (isUndesiredAlias(alias)) 163 return; 164 const char* atomicName = textEncodingNameMap->get(name); 165 ASSERT(strcmp(alias, name) == 0 || atomicName); 166 if (!atomicName) 167 atomicName = name; 168 checkExistingName(alias, atomicName); 169 textEncodingNameMap->add(alias, atomicName); 170} 171 172static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) 173{ 174 const char* atomicName = textEncodingNameMap->get(name); 175 ASSERT(atomicName); 176 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); 177} 178 179static void pruneBlacklistedCodecs() 180{ 181 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { 182 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); 183 if (!atomicName) 184 continue; 185 186 Vector<const char*> names; 187 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 188 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 189 for (; it != end; ++it) { 190 if (it->value == atomicName) 191 names.append(it->key); 192 } 193 194 textEncodingNameMap->removeAll(names); 195 196 textCodecMap->remove(atomicName); 197 } 198} 199 200static void buildBaseTextCodecMaps() 201{ 202 ASSERT(isMainThread()); 203 ASSERT(!textCodecMap); 204 ASSERT(!textEncodingNameMap); 205 206 textCodecMap = new TextCodecMap; 207 textEncodingNameMap = new TextEncodingNameMap; 208 209 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); 210 TextCodecLatin1::registerCodecs(addToTextCodecMap); 211 212 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap); 213 TextCodecUTF8::registerCodecs(addToTextCodecMap); 214 215 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); 216 TextCodecUTF16::registerCodecs(addToTextCodecMap); 217 218 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); 219 TextCodecUserDefined::registerCodecs(addToTextCodecMap); 220} 221 222bool isReplacementEncoding(const char* alias) 223{ 224 return alias && !strcasecmp(alias, "replacement"); 225} 226 227bool isReplacementEncoding(const String& alias) 228{ 229 return alias == "replacement"; 230} 231 232static void extendTextCodecMaps() 233{ 234 TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap); 235 TextCodecReplacement::registerCodecs(addToTextCodecMap); 236 237 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap); 238 TextCodecICU::registerCodecs(addToTextCodecMap); 239 240 pruneBlacklistedCodecs(); 241} 242 243PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding) 244{ 245 MutexLocker lock(encodingRegistryMutex()); 246 247 ASSERT(textCodecMap); 248 TextCodecFactory factory = textCodecMap->get(encoding.name()); 249 ASSERT(factory.function); 250 return factory.function(encoding, factory.additionalData); 251} 252 253const char* atomicCanonicalTextEncodingName(const char* name) 254{ 255 if (!name || !name[0]) 256 return 0; 257 if (!textEncodingNameMap) 258 buildBaseTextCodecMaps(); 259 260 MutexLocker lock(encodingRegistryMutex()); 261 262 if (const char* atomicName = textEncodingNameMap->get(name)) 263 return atomicName; 264 if (didExtendTextCodecMaps) 265 return 0; 266 extendTextCodecMaps(); 267 didExtendTextCodecMaps = true; 268 return textEncodingNameMap->get(name); 269} 270 271template <typename CharacterType> 272const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) 273{ 274 char buffer[maxEncodingNameLength + 1]; 275 size_t j = 0; 276 for (size_t i = 0; i < length; ++i) { 277 CharacterType c = characters[i]; 278 if (j == maxEncodingNameLength) 279 return 0; 280 buffer[j++] = c; 281 } 282 buffer[j] = 0; 283 return atomicCanonicalTextEncodingName(buffer); 284} 285 286const char* atomicCanonicalTextEncodingName(const String& alias) 287{ 288 if (!alias.length()) 289 return 0; 290 291 if (alias.is8Bit()) 292 return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length()); 293 294 return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length()); 295} 296 297bool noExtendedTextEncodingNameUsed() 298{ 299 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. 300 return !didExtendTextCodecMaps; 301} 302 303#ifndef NDEBUG 304void dumpTextEncodingNameMap() 305{ 306 unsigned size = textEncodingNameMap->size(); 307 fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size); 308 309 MutexLocker lock(encodingRegistryMutex()); 310 311 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 312 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 313 for (; it != end; ++it) 314 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value); 315} 316#endif 317 318} // namespace WTF 319