1/* 2 * Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved. 3 * Copyright (C) 2010-2011 Patrick Gansterer <paroga@paroga.com> 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * This library is distributed in the hope that i will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Library General Public License for more details. 18 * 19 * You should have received a copy of the GNU Library General Public License 20 * along with this library; see the file COPYING.LIB. If not, write to 21 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 22 * Boston, MA 02110-1301, USA. 23 */ 24 25#include "config.h" 26#include "TextCodecWinCE.h" 27 28#include "FontCache.h" 29#include "PlatformString.h" 30#include <mlang.h> 31#include <winbase.h> 32#include <winnls.h> 33#include <wtf/HashMap.h> 34#include <wtf/HashSet.h> 35#include <wtf/text/CString.h> 36#include <wtf/text/StringConcatenate.h> 37#include <wtf/text/StringHash.h> 38 39namespace WebCore { 40 41struct CharsetInfo { 42 CString m_name; 43 String m_friendlyName; 44 UINT m_codePage; 45 Vector<CString> m_aliases; 46}; 47 48class LanguageManager { 49private: 50 LanguageManager(); 51 52 friend LanguageManager& languageManager(); 53}; 54 55// Usage: a lookup table used to get CharsetInfo with code page ID. 56// Key: code page ID. Value: charset information. 57static HashMap<UINT, CString>& codePageCharsets() 58{ 59 static HashMap<UINT, CString> cc; 60 return cc; 61} 62 63static HashMap<String, CharsetInfo>& knownCharsets() 64{ 65 static HashMap<String, CharsetInfo> kc; 66 return kc; 67} 68 69// Usage: a map that stores charsets that are supported by system. Sorted by name. 70// Key: charset. Value: code page ID. 71typedef HashSet<String> CharsetSet; 72static CharsetSet& supportedCharsets() 73{ 74 static CharsetSet sl; 75 return sl; 76} 77 78static LanguageManager& languageManager() 79{ 80 static LanguageManager lm; 81 return lm; 82} 83 84LanguageManager::LanguageManager() 85{ 86 IEnumCodePage* enumInterface; 87 IMultiLanguage* mli = FontCache::getMultiLanguageInterface(); 88 if (mli && S_OK == mli->EnumCodePages(MIMECONTF_BROWSER, &enumInterface)) { 89 MIMECPINFO cpInfo; 90 ULONG ccpInfo; 91 while (S_OK == enumInterface->Next(1, &cpInfo, &ccpInfo) && ccpInfo) { 92 if (!IsValidCodePage(cpInfo.uiCodePage)) 93 continue; 94 95 HashMap<UINT, CString>::iterator i = codePageCharsets().find(cpInfo.uiCodePage); 96 97 CString name(String(cpInfo.wszWebCharset).latin1()); 98 if (i == codePageCharsets().end()) { 99 CharsetInfo info; 100 info.m_codePage = cpInfo.uiCodePage; 101 knownCharsets().set(name.data(), info); 102 i = codePageCharsets().set(cpInfo.uiCodePage, name).first; 103 } 104 if (i != codePageCharsets().end()) { 105 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(String(i->second.data(), i->second.length())); 106 ASSERT(j != knownCharsets().end()); 107 CharsetInfo& info = j->second; 108 info.m_name = i->second.data(); 109 info.m_friendlyName = cpInfo.wszDescription; 110 info.m_aliases.append(name); 111 info.m_aliases.append(String(cpInfo.wszHeaderCharset).latin1()); 112 info.m_aliases.append(String(cpInfo.wszBodyCharset).latin1()); 113 String cpName = makeString("cp", String::number(cpInfo.uiCodePage)); 114 info.m_aliases.append(cpName.latin1()); 115 supportedCharsets().add(i->second.data()); 116 } 117 } 118 enumInterface->Release(); 119 } 120} 121 122static UINT getCodePage(const char* name) 123{ 124 // Explicitly use a "const" reference to fix the silly VS build error 125 // saying "==" is not found for const_iterator and iterator 126 const HashMap<String, CharsetInfo>& charsets = knownCharsets(); 127 HashMap<String, CharsetInfo>::const_iterator i = charsets.find(name); 128 return i == charsets.end() ? CP_ACP : i->second.m_codePage; 129} 130 131static PassOwnPtr<TextCodec> newTextCodecWinCE(const TextEncoding& encoding, const void*) 132{ 133 return new TextCodecWinCE(getCodePage(encoding.name())); 134} 135 136TextCodecWinCE::TextCodecWinCE(UINT codePage) 137 : m_codePage(codePage) 138{ 139} 140 141TextCodecWinCE::~TextCodecWinCE() 142{ 143} 144 145void TextCodecWinCE::registerExtendedEncodingNames(EncodingNameRegistrar registrar) 146{ 147 languageManager(); 148 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { 149 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); 150 if (j != knownCharsets().end()) { 151 registrar(j->second.m_name.data(), j->second.m_name.data()); 152 for (Vector<CString>::const_iterator alias = j->second.m_aliases.begin(); alias != j->second.m_aliases.end(); ++alias) 153 registrar(alias->data(), j->second.m_name.data()); 154 } 155 } 156} 157 158void TextCodecWinCE::registerExtendedCodecs(TextCodecRegistrar registrar) 159{ 160 languageManager(); 161 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { 162 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); 163 if (j != knownCharsets().end()) 164 registrar(j->second.m_name.data(), newTextCodecWinCE, 0); 165 } 166} 167 168static DWORD getCodePageFlags(UINT codePage) 169{ 170 if (codePage == 42) // Symbol 171 return 0; 172 173 // Microsoft says the flag must be 0 for the following code pages 174 if (codePage > 50000) { 175 if ((codePage >= 50220 && codePage <= 50222) 176 || codePage == 50225 177 || codePage == 50227 178 || codePage == 50229 179 || codePage == 52936 180 || codePage == 54936 181 || (codePage >= 57002 && codePage <= 57001) 182 || codePage == 65000 // UTF-7 183 ) 184 return 0; 185 } 186 187 return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS; 188} 189 190static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length) 191{ 192 for (const char* bytesEnd = bytes + length; bytes < bytesEnd; ++bytes) { 193 if (*bytes & 0x80) 194 break; 195 } 196 return bytes; 197} 198 199static void decodeInternal(Vector<UChar, 8192>& result, UINT codePage, const char* bytes, size_t length, size_t* left) 200{ 201 *left = length; 202 if (!bytes || !length) 203 return; 204 205 DWORD flags = getCodePageFlags(codePage); 206 207 int testLength = length; 208 int untestedLength = length; 209 for (;;) { 210 int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLength, 0, 0); 211 212 if (resultLength > 0) { 213 int oldSize = result.size(); 214 result.resize(oldSize + resultLength); 215 216 MultiByteToWideChar(codePage, flags, bytes, testLength, result.data() + oldSize, resultLength); 217 218 if (testLength == untestedLength) { 219 *left = length - testLength; 220 break; 221 } 222 untestedLength -= testLength; 223 length -= testLength; 224 bytes += testLength; 225 } else { 226 untestedLength = testLength - 1; 227 if (!untestedLength) { 228 *left = length; 229 break; 230 } 231 } 232 testLength = (untestedLength + 1) / 2; 233 } 234} 235 236String TextCodecWinCE::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 237{ 238 if (!m_decodeBuffer.isEmpty()) { 239 m_decodeBuffer.append(bytes, length); 240 bytes = m_decodeBuffer.data(); 241 length = m_decodeBuffer.size(); 242 } 243 244 size_t left; 245 Vector<UChar, 8192> result; 246 for (;;) { 247 decodeInternal(result, m_codePage, bytes, length, &left); 248 if (!left) 249 break; 250 251 if (!flush && left < 16) 252 break; 253 254 result.append(L'?'); 255 sawError = true; 256 if (stopOnError) 257 return String::adopt(result); 258 259 if (left == 1) 260 break; 261 262 bytes += length - left + 1; 263 length = left - 1; 264 } 265 if (left && !flush) { 266 if (m_decodeBuffer.isEmpty()) 267 m_decodeBuffer.append(bytes + length - left, left); 268 else { 269 memmove(m_decodeBuffer.data(), bytes + length - left, left); 270 m_decodeBuffer.resize(left); 271 } 272 } else 273 m_decodeBuffer.clear(); 274 275 return String::adopt(result); 276} 277 278CString TextCodecWinCE::encode(const UChar* characters, size_t length, UnencodableHandling) 279{ 280 if (!characters || !length) 281 return CString(); 282 283 int resultLength = WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, 0, 0, 0, 0); 284 285 // FIXME: We need to implement UnencodableHandling: QuestionMarksForUnencodables, EntitiesForUnencodables, and URLEncodedEntitiesForUnencodables. 286 287 if (resultLength <= 0) 288 return "?"; 289 290 char* characterBuffer; 291 CString result = CString::newUninitialized(resultLength, characterBuffer); 292 293 WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, characterBuffer, resultLength, 0, 0); 294 295 return result; 296} 297 298void TextCodecWinCE::enumerateSupportedEncodings(EncodingReceiver& receiver) 299{ 300 languageManager(); 301 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { 302 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); 303 if (j != knownCharsets().end() && !receiver.receive(j->second.m_name.data(), j->second.m_friendlyName.charactersWithNullTermination(), j->second.m_codePage)) 304 break; 305 } 306} 307 308} // namespace WebCore 309