icu_encoding_detection.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/i18n/icu_encoding_detection.h" 6 7#include "base/string_util.h" 8#include "unicode/ucsdet.h" 9 10namespace base { 11 12// TODO(jungshik): We can apply more heuristics here (e.g. using various hints 13// like TLD, the UI language/default encoding of a client, etc). 14bool DetectEncoding(const std::string& text, std::string* encoding) { 15 if (IsStringASCII(text)) { 16 *encoding = std::string(); 17 return true; 18 } 19 20 UErrorCode status = U_ZERO_ERROR; 21 UCharsetDetector* detector = ucsdet_open(&status); 22 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), 23 &status); 24 // TODO(jungshik): Should we check the quality of the match? A rather 25 // arbitrary number is assigned by ICU and it's hard to come up with 26 // a lower limit. 27 const UCharsetMatch* match = ucsdet_detect(detector, &status); 28 const char* detected_encoding = ucsdet_getName(match, &status); 29 ucsdet_close(detector); 30 31 if (U_FAILURE(status)) 32 return false; 33 34 *encoding = detected_encoding; 35 return true; 36} 37 38} // namespace base 39