/*
 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
308f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
318f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "config.h"
328f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "TextEncodingDetector.h"
338f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
348f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "TextEncoding.h"
355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian#include <wtf/UnusedParam.h>
368f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
378f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#ifndef BUILDING_ON_TIGER
388f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "unicode/ucnv.h"
398f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#include "unicode/ucsdet.h"
408f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#endif
418f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
428f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qiannamespace WebCore {
438f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
448f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qianbool detectTextEncoding(const char* data, size_t len,
458f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                        const char* hintEncodingName,
468f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                        TextEncoding* detectedEncoding)
478f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian{
488f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    *detectedEncoding = TextEncoding();
498f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#ifdef BUILDING_ON_TIGER
508f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // Tiger came with ICU 3.2 and does not have the encoding detector.
518f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    UNUSED_PARAM(data);
528f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    UNUSED_PARAM(len);
538f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    UNUSED_PARAM(hintEncodingName);
548f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    return false;
558f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#else
568f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    int matchesCount = 0;
578f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    UErrorCode status = U_ZERO_ERROR;
588f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    UCharsetDetector* detector = ucsdet_open(&status);
598f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    if (U_FAILURE(status))
608f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        return false;
618f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    ucsdet_enableInputFilter(detector, true);
628f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
638f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    if (U_FAILURE(status))
648f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        return false;
658f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
668f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // FIXME: A few things we can do other than improving
678f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // the ICU detector itself.
688f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // 1. Use ucsdet_detectAll and pick the most likely one given
698f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // "the context" (parent-encoding, referrer encoding, etc).
708f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
718f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
72d0825bca7fe65beaee391d30da42e937db621564Steve Block    // encoding with a highest confidence among the detector-specific
738f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // limited set of candidate encodings.
748f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // Below is a partial implementation of the first part of what's outlined
758f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // above.
768f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
778f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    if (U_FAILURE(status)) {
788f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        ucsdet_close(detector);
798f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        return false;
808f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    }
818f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
828f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    const char* encoding = 0;
838f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    if (hintEncodingName) {
848f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        TextEncoding hintEncoding(hintEncodingName);
858f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // 10 is the minimum confidence value consistent with the codepoint
868f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // allocation in a given encoding. The size of a chunk passed to
878f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // us varies even for the same html file (apparently depending on
888f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // the network load). When we're given a rather short chunk, we
898f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // don't have a sufficiently reliable signal other than the fact that
908f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // the chunk is consistent with a set of encodings. So, instead of
918f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // setting an arbitrary threshold, we have to scan all the encodings
928f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        // consistent with the data.
938f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        const int32_t kThresold = 10;
948f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        for (int i = 0; i < matchesCount; ++i) {
958f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            int32_t confidence = ucsdet_getConfidence(matches[i], &status);
968f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            if (U_FAILURE(status)) {
978f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                status = U_ZERO_ERROR;
988f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                continue;
998f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            }
1008f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            if (confidence < kThresold)
1018f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                break;
1028f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            const char* matchEncoding = ucsdet_getName(matches[i], &status);
1038f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            if (U_FAILURE(status)) {
1048f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                status = U_ZERO_ERROR;
1058f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                continue;
1068f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            }
1078f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            if (TextEncoding(matchEncoding) == hintEncoding) {
1088f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                encoding = hintEncodingName;
1098f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian                break;
1108f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian            }
1118f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        }
1128f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    }
1138f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // If no match is found so far, just pick the top match.
1148f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // This can happen, say, when a parent frame in EUC-JP refers to
1158f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // a child frame in Shift_JIS and both frames do NOT specify the encoding
1168f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    // making us resort to auto-detection (when it IS turned on).
1178f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    if (!encoding && matchesCount > 0)
1188f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        encoding = ucsdet_getName(matches[0], &status);
1198f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    if (U_SUCCESS(status)) {
1208f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        *detectedEncoding = TextEncoding(encoding);
1218f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        ucsdet_close(detector);
1228f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian        return true;
1238f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    }
1248f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    ucsdet_close(detector);
1258f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian    return false;
1268f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian#endif
1278f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian}
1288f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian
1298f72e70a9fd78eec56623b3a62e68f16b7b27e28Feng Qian}
130