icu_encoding_detection.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/i18n/icu_encoding_detection.h"
6
7#include "base/string_util.h"
8#include "unicode/ucsdet.h"
9
10namespace base {
11
12// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
13// like TLD, the UI language/default encoding of a client, etc).
14bool DetectEncoding(const std::string& text, std::string* encoding) {
15  if (IsStringASCII(text)) {
16    *encoding = std::string();
17    return true;
18  }
19
20  UErrorCode status = U_ZERO_ERROR;
21  UCharsetDetector* detector = ucsdet_open(&status);
22  ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
23                 &status);
24  // TODO(jungshik): Should we check the quality of the match? A rather
25  // arbitrary number is assigned by ICU and it's hard to come up with
26  // a lower limit.
27  const UCharsetMatch* match = ucsdet_detect(detector, &status);
28  const char* detected_encoding = ucsdet_getName(match, &status);
29  ucsdet_close(detector);
30
31  if (U_FAILURE(status))
32    return false;
33
34  *encoding = detected_encoding;
35  return true;
36}
37
38}  // namespace base
39