15c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)/*
25c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
35c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *
45c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * Redistribution and use in source and binary forms, with or without
55c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * modification, are permitted provided that the following conditions are
65c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * met:
75c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *
85c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *     * Redistributions of source code must retain the above copyright
95c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * notice, this list of conditions and the following disclaimer.
105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *     * Redistributions in binary form must reproduce the above
115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * copyright notice, this list of conditions and the following disclaimer
125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * in the documentation and/or other materials provided with the
135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * distribution.
145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *     * Neither the name of Google Inc. nor the names of its
155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * contributors may be used to endorse or promote products derived from
165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * this software without specific prior written permission.
175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *
185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) */
305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "config.h"
321e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)#include "platform/text/TextEncodingDetector.h"
335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3481a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include "wtf/text/TextEncoding.h"
3581a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include <unicode/ucnv.h>
3681a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include <unicode/ucsdet.h>
375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
38c1847b1379d12d0e05df27436bf19a9b1bf12deaTorne (Richard Coles)namespace blink {
395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
401e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)bool detectTextEncoding(const char* data, size_t length,
411e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)    const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
4381a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)    *detectedEncoding = WTF::TextEncoding();
4402772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    int matchesCount = 0;
455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    UErrorCode status = U_ZERO_ERROR;
465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    UCharsetDetector* detector = ucsdet_open(&status);
475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (U_FAILURE(status))
485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return false;
495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ucsdet_enableInputFilter(detector, true);
501e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)    ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);
515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (U_FAILURE(status))
525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return false;
535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // FIXME: A few things we can do other than improving
5502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    // the ICU detector itself.
565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // 1. Use ucsdet_detectAll and pick the most likely one given
575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // "the context" (parent-encoding, referrer encoding, etc).
585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
5902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // encoding with a highest confidence among the detector-specific
615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // limited set of candidate encodings.
625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Below is a partial implementation of the first part of what's outlined
635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // above.
645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (U_FAILURE(status)) {
665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        ucsdet_close(detector);
675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return false;
685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    const char* encoding = 0;
715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (hintEncodingName) {
7281a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)        WTF::TextEncoding hintEncoding(hintEncodingName);
735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // 10 is the minimum confidence value consistent with the codepoint
745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // allocation in a given encoding. The size of a chunk passed to
7502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        // us varies even for the same html file (apparently depending on
7602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        // the network load). When we're given a rather short chunk, we
775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // don't have a sufficiently reliable signal other than the fact that
785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // the chunk is consistent with a set of encodings. So, instead of
795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // setting an arbitrary threshold, we have to scan all the encodings
8002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        // consistent with the data.
815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const int32_t kThresold = 10;
825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        for (int i = 0; i < matchesCount; ++i) {
835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            int32_t confidence = ucsdet_getConfidence(matches[i], &status);
845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (U_FAILURE(status)) {
855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                status = U_ZERO_ERROR;
865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                continue;
875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (confidence < kThresold)
895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                break;
905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            const char* matchEncoding = ucsdet_getName(matches[i], &status);
915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (U_FAILURE(status)) {
925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                status = U_ZERO_ERROR;
935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                continue;
945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
9581a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)            if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                encoding = hintEncodingName;
975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                break;
985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
1005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
10102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    // If no match is found so far, just pick the top match.
1025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // This can happen, say, when a parent frame in EUC-JP refers to
1035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // a child frame in Shift_JIS and both frames do NOT specify the encoding
1045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // making us resort to auto-detection (when it IS turned on).
1055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (!encoding && matchesCount > 0)
1065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        encoding = ucsdet_getName(matches[0], &status);
1075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (U_SUCCESS(status)) {
10881a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)        *detectedEncoding = WTF::TextEncoding(encoding);
1095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        ucsdet_close(detector);
1105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return true;
11102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    }
1125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ucsdet_close(detector);
1135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
1145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
117