/*
 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) */ 305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "config.h" 321e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)#include "platform/text/TextEncodingDetector.h" 335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3481a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include "wtf/text/TextEncoding.h" 3581a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include <unicode/ucnv.h> 3681a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles)#include <unicode/ucsdet.h> 375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 38c1847b1379d12d0e05df27436bf19a9b1bf12deaTorne (Richard Coles)namespace blink { 395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 401e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)bool detectTextEncoding(const char* data, size_t length, 411e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles) const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) 425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 4381a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles) *detectedEncoding = WTF::TextEncoding(); 4402772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch int matchesCount = 0; 455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) UCharsetDetector* detector = ucsdet_open(&status); 475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (U_FAILURE(status)) 485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ucsdet_enableInputFilter(detector, true); 501e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles) ucsdet_setText(detector, data, static_cast<int32_t>(length), &status); 
515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (U_FAILURE(status)) 525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // FIXME: A few things we can do other than improving 5502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // the ICU detector itself. 565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // 1. Use ucsdet_detectAll and pick the most likely one given 575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // "the context" (parent-encoding, referrer encoding, etc). 585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. 5902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // Chinese, Japanese, Russian, Korean and Hebrew) by picking the 605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // encoding with a highest confidence among the detector-specific 615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // limited set of candidate encodings. 625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Below is a partial implementation of the first part of what's outlined 635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // above. 
645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); 655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (U_FAILURE(status)) { 665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ucsdet_close(detector); 675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const char* encoding = 0; 715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (hintEncodingName) { 7281a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles) WTF::TextEncoding hintEncoding(hintEncodingName); 735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // 10 is the minimum confidence value consistent with the codepoint 745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // allocation in a given encoding. The size of a chunk passed to 7502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // us varies even for the same html file (apparently depending on 7602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // the network load). When we're given a rather short chunk, we 775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // don't have a sufficiently reliable signal other than the fact that 785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // the chunk is consistent with a set of encodings. So, instead of 795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // setting an arbitrary threshold, we have to scan all the encodings 8002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // consistent with the data. 
815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const int32_t kThresold = 10; 825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) for (int i = 0; i < matchesCount; ++i) { 835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int32_t confidence = ucsdet_getConfidence(matches[i], &status); 845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (U_FAILURE(status)) { 855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) status = U_ZERO_ERROR; 865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (confidence < kThresold) 895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) break; 905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const char* matchEncoding = ucsdet_getName(matches[i], &status); 915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (U_FAILURE(status)) { 925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) status = U_ZERO_ERROR; 935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 9581a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles) if (WTF::TextEncoding(matchEncoding) == hintEncoding) { 965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) encoding = hintEncodingName; 975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) break; 985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 10102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // If no match is found so far, just pick the top match. 
1025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // This can happen, say, when a parent frame in EUC-JP refers to 1035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // a child frame in Shift_JIS and both frames do NOT specify the encoding 1045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // making us resort to auto-detection (when it IS turned on). 1055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (!encoding && matchesCount > 0) 1065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) encoding = ucsdet_getName(matches[0], &status); 1075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (U_SUCCESS(status)) { 10881a5157921f1d2a7ff6aae115bfe3c139b38a5c8Torne (Richard Coles) *detectedEncoding = WTF::TextEncoding(encoding); 1095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ucsdet_close(detector); 1105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return true; 11102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch } 1125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ucsdet_close(detector); 1135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 1145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 117