1/*
2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "TextEncodingDetector.h"
33
34#include "TextEncoding.h"
35#include <wtf/UnusedParam.h>
36
37#ifndef BUILDING_ON_TIGER
38#include "unicode/ucnv.h"
39#include "unicode/ucsdet.h"
40#endif
41
42namespace WebCore {
43
44bool detectTextEncoding(const char* data, size_t len,
45                        const char* hintEncodingName,
46                        TextEncoding* detectedEncoding)
47{
48    *detectedEncoding = TextEncoding();
49#ifdef BUILDING_ON_TIGER
50    // Tiger came with ICU 3.2 and does not have the encoding detector.
51    UNUSED_PARAM(data);
52    UNUSED_PARAM(len);
53    UNUSED_PARAM(hintEncodingName);
54    return false;
55#else
56    int matchesCount = 0;
57    UErrorCode status = U_ZERO_ERROR;
58    UCharsetDetector* detector = ucsdet_open(&status);
59    if (U_FAILURE(status))
60        return false;
61    ucsdet_enableInputFilter(detector, true);
62    ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
63    if (U_FAILURE(status))
64        return false;
65
66    // FIXME: A few things we can do other than improving
67    // the ICU detector itself.
68    // 1. Use ucsdet_detectAll and pick the most likely one given
69    // "the context" (parent-encoding, referrer encoding, etc).
70    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
71    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
72    // encoding with a highest confidence among the detector-specific
73    // limited set of candidate encodings.
74    // Below is a partial implementation of the first part of what's outlined
75    // above.
76    const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
77    if (U_FAILURE(status)) {
78        ucsdet_close(detector);
79        return false;
80    }
81
82    const char* encoding = 0;
83    if (hintEncodingName) {
84        TextEncoding hintEncoding(hintEncodingName);
85        // 10 is the minimum confidence value consistent with the codepoint
86        // allocation in a given encoding. The size of a chunk passed to
87        // us varies even for the same html file (apparently depending on
88        // the network load). When we're given a rather short chunk, we
89        // don't have a sufficiently reliable signal other than the fact that
90        // the chunk is consistent with a set of encodings. So, instead of
91        // setting an arbitrary threshold, we have to scan all the encodings
92        // consistent with the data.
93        const int32_t kThresold = 10;
94        for (int i = 0; i < matchesCount; ++i) {
95            int32_t confidence = ucsdet_getConfidence(matches[i], &status);
96            if (U_FAILURE(status)) {
97                status = U_ZERO_ERROR;
98                continue;
99            }
100            if (confidence < kThresold)
101                break;
102            const char* matchEncoding = ucsdet_getName(matches[i], &status);
103            if (U_FAILURE(status)) {
104                status = U_ZERO_ERROR;
105                continue;
106            }
107            if (TextEncoding(matchEncoding) == hintEncoding) {
108                encoding = hintEncodingName;
109                break;
110            }
111        }
112    }
113    // If no match is found so far, just pick the top match.
114    // This can happen, say, when a parent frame in EUC-JP refers to
115    // a child frame in Shift_JIS and both frames do NOT specify the encoding
116    // making us resort to auto-detection (when it IS turned on).
117    if (!encoding && matchesCount > 0)
118        encoding = ucsdet_getName(matches[0], &status);
119    if (U_SUCCESS(status)) {
120        *detectedEncoding = TextEncoding(encoding);
121        ucsdet_close(detector);
122        return true;
123    }
124    ucsdet_close(detector);
125    return false;
126#endif
127}
128
129}
130