icu_encoding_detection.cc revision 868fa2fe829687343ffae624259930155e16dbd8
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_encoding_detection.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
102a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/icu/public/i18n/unicode/ucsdet.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base {
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DetectEncoding(const std::string& text, std::string* encoding) {
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (IsStringASCII(text)) {
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *encoding = std::string();
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return true;
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UCharsetDetector* detector = ucsdet_open(&status);
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 &status);
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const UCharsetMatch* match = ucsdet_detect(detector, &status);
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (match == NULL)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* detected_encoding = ucsdet_getName(match, &status);
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ucsdet_close(detector);
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_FAILURE(status))
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *encoding = detected_encoding;
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DetectAllEncodings(const std::string& text,
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                        std::vector<std::string>* encodings) {
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UCharsetDetector* detector = ucsdet_open(&status);
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 &status);
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int matches_count = 0;
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const UCharsetMatch** matches = ucsdet_detectAll(detector,
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                   &matches_count,
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                   &status);
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_FAILURE(status)) {
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ucsdet_close(detector);
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ICU has some heuristics for encoding detection, such that the more likely
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // encodings should be returned first. However, it doesn't always return
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // all encodings that properly decode |text|, so we'll append more encodings
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // later. To make that efficient, keep track of encodings sniffed in this
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // first phase.
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::set<std::string> sniffed_encodings;
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  encodings->clear();
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < matches_count; i++) {
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UErrorCode get_name_status = U_ZERO_ERROR;
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char* encoding_name = ucsdet_getName(matches[i], &get_name_status);
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // If we failed to get the encoding's name, ignore the error.
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (U_FAILURE(get_name_status))
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status);
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We also treat this error as non-fatal.
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (U_FAILURE(get_name_status))
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // A confidence level >= 10 means that the encoding is expected to properly
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // decode the text. Drop all encodings with lower confidence level.
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (confidence < 10)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    encodings->push_back(encoding_name);
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sniffed_encodings.insert(encoding_name);
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Append all encodings not included earlier, in arbitrary order.
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // TODO(jshin): This shouldn't be necessary, possible ICU bug.
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // See also http://crbug.com/65917.
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector,
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                                       &status);
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int detectable_count = uenum_count(detectable_encodings, &status);
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < detectable_count; i++) {
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int name_length;
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char* name_raw = uenum_next(detectable_encodings,
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      &name_length,
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      &status);
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::string name(name_raw, name_length);
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (sniffed_encodings.find(name) == sniffed_encodings.end())
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      encodings->push_back(name);
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uenum_close(detectable_encodings);
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ucsdet_close(detector);
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return !encodings->empty();
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace base
105