icu_encoding_detection.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_encoding_detection.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/string_util.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/ucsdet.h" 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base { 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DetectEncoding(const std::string& text, std::string* encoding) { 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsStringASCII(text)) { 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *encoding = std::string(); 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UCharsetDetector* detector = ucsdet_open(&status); 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &status); 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const UCharsetMatch* match = ucsdet_detect(detector, &status); 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (match == NULL) 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* detected_encoding = ucsdet_getName(match, &status); 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucsdet_close(detector); 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (U_FAILURE(status)) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *encoding = detected_encoding; 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DetectAllEncodings(const std::string& text, 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<std::string>* encodings) { 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UCharsetDetector* detector = ucsdet_open(&status); 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &status); 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int matches_count = 0; 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const UCharsetMatch** matches = ucsdet_detectAll(detector, 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &matches_count, 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &status); 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (U_FAILURE(status)) { 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucsdet_close(detector); 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ICU has some heuristics for encoding detection, such that the more likely 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // encodings should be returned first. However, it doesn't always return 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // all encodings that properly decode |text|, so we'll append more encodings 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // later. To make that efficient, keep track of encodings sniffed in this 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // first phase. 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::set<std::string> sniffed_encodings; 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encodings->clear(); 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < matches_count; i++) { 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode get_name_status = U_ZERO_ERROR; 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* encoding_name = ucsdet_getName(matches[i], &get_name_status); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If we failed to get the encoding's name, ignore the error. 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (U_FAILURE(get_name_status)) 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status); 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We also treat this error as non-fatal. 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (U_FAILURE(get_name_status)) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A confidence level >= 10 means that the encoding is expected to properly 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // decode the text. Drop all encodings with lower confidence level. 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (confidence < 10) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encodings->push_back(encoding_name); 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sniffed_encodings.insert(encoding_name); 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Append all encodings not included earlier, in arbitrary order. 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(jshin): This shouldn't be necessary, possible ICU bug. 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // See also http://crbug.com/65917. 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector, 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &status); 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int detectable_count = uenum_count(detectable_encodings, &status); 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < detectable_count; i++) { 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int name_length; 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* name_raw = uenum_next(detectable_encodings, 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &name_length, 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &status); 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string name(name_raw, name_length); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sniffed_encodings.find(name) == sniffed_encodings.end()) 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encodings->push_back(name); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uenum_close(detectable_encodings); 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ucsdet_close(detector); 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return !encodings->empty(); 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace base 105