18f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian/* 28f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * Copyright (C) 2008, 2009 Google Inc. All rights reserved. 38f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * 48f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * Redistribution and use in source and binary forms, with or without 58f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * modification, are permitted provided that the following conditions are 68f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * met: 78f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * 88f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * * Redistributions of source code must retain the above copyright 98f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * notice, this list of conditions and the following disclaimer. 108f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * * Redistributions in binary form must reproduce the above 118f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * copyright notice, this list of conditions and the following disclaimer 128f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * in the documentation and/or other materials provided with the 138f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * distribution. 148f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * * Neither the name of Google Inc. nor the names of its 158f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * contributors may be used to endorse or promote products derived from 168f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * this software without specific prior written permission. 178f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * 188f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 198f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 208f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 218f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 228f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 238f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 248f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 258f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 268f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 278f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 288f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 298f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian */ 308f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 318f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "config.h" 328f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "TextEncodingDetector.h" 338f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 348f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "TextEncoding.h" 355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian#include <wtf/UnusedParam.h> 368f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 378f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#ifndef BUILDING_ON_TIGER 388f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "unicode/ucnv.h" 398f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "unicode/ucsdet.h" 408f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#endif 418f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 428f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qiannamespace WebCore { 438f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 448f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qianbool detectTextEncoding(const char* data, size_t len, 458f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian const char* hintEncodingName, 468f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian TextEncoding* detectedEncoding) 478f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian{ 488f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian *detectedEncoding = TextEncoding(); 498f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#ifdef BUILDING_ON_TIGER 508f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // Tiger came with ICU 3.2 and does not have the encoding detector. 518f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian UNUSED_PARAM(data); 528f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian UNUSED_PARAM(len); 538f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian UNUSED_PARAM(hintEncodingName); 548f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian return false; 558f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#else 568f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian int matchesCount = 0; 578f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian UErrorCode status = U_ZERO_ERROR; 588f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian UCharsetDetector* detector = ucsdet_open(&status); 598f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (U_FAILURE(status)) 608f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian return false; 618f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian ucsdet_enableInputFilter(detector, true); 628f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); 638f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (U_FAILURE(status)) 648f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian return false; 658f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 668f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // FIXME: A few things we can do other than improving 678f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // the ICU detector itself. 688f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // 1. Use ucsdet_detectAll and pick the most likely one given 698f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // "the context" (parent-encoding, referrer encoding, etc). 708f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. 718f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // Chinese, Japanese, Russian, Korean and Hebrew) by picking the 72d0825bca7fe65beaee391d30da42e937db621564Steve Block // encoding with a highest confidence among the detector-specific 738f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // limited set of candidate encodings. 748f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // Below is a partial implementation of the first part of what's outlined 758f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // above. 768f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); 778f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (U_FAILURE(status)) { 788f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian ucsdet_close(detector); 798f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian return false; 808f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian } 818f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 828f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian const char* encoding = 0; 838f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (hintEncodingName) { 848f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian TextEncoding hintEncoding(hintEncodingName); 858f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // 10 is the minimum confidence value consistent with the codepoint 868f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // allocation in a given encoding. The size of a chunk passed to 878f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // us varies even for the same html file (apparently depending on 888f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // the network load). When we're given a rather short chunk, we 898f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // don't have a sufficiently reliable signal other than the fact that 908f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // the chunk is consistent with a set of encodings. So, instead of 918f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // setting an arbitrary threshold, we have to scan all the encodings 928f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // consistent with the data. 938f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian const int32_t kThresold = 10; 948f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian for (int i = 0; i < matchesCount; ++i) { 958f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian int32_t confidence = ucsdet_getConfidence(matches[i], &status); 968f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (U_FAILURE(status)) { 978f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian status = U_ZERO_ERROR; 988f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian continue; 998f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian } 1008f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (confidence < kThresold) 1018f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian break; 1028f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian const char* matchEncoding = ucsdet_getName(matches[i], &status); 1038f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (U_FAILURE(status)) { 1048f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian status = U_ZERO_ERROR; 1058f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian continue; 1068f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian } 1078f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (TextEncoding(matchEncoding) == hintEncoding) { 1088f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian encoding = hintEncodingName; 1098f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian break; 1108f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian } 1118f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian } 1128f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian } 1138f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // If no match is found so far, just pick the top match. 1148f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // This can happen, say, when a parent frame in EUC-JP refers to 1158f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // a child frame in Shift_JIS and both frames do NOT specify the encoding 1168f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian // making us resort to auto-detection (when it IS turned on). 1178f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (!encoding && matchesCount > 0) 1188f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian encoding = ucsdet_getName(matches[0], &status); 1198f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian if (U_SUCCESS(status)) { 1208f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian *detectedEncoding = TextEncoding(encoding); 1218f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian ucsdet_close(detector); 1228f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian return true; 1238f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian } 1248f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian ucsdet_close(detector); 1258f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian return false; 1268f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#endif 1278f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian} 1288f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian 1298f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian} 130