1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "cpp/src/util/canonicalize_string.h"
6
7#include "base/logging.h"
8#include "cpp/include/libaddressinput/util/scoped_ptr.h"
9#include "third_party/icu/source/common/unicode/errorcode.h"
10#include "third_party/icu/source/common/unicode/locid.h"
11#include "third_party/icu/source/common/unicode/unistr.h"
12#include "third_party/icu/source/common/unicode/utypes.h"
13#include "third_party/icu/source/i18n/unicode/coll.h"
14
15namespace i18n {
16namespace addressinput {
17
18namespace {
19
20class ChromeStringCanonicalizer : public StringCanonicalizer {
21 public:
22  ChromeStringCanonicalizer()
23      : error_code_(U_ZERO_ERROR),
24        collator_(
25            icu::Collator::createInstance(
26                icu::Locale::getRoot(), error_code_)) {
27    collator_->setStrength(icu::Collator::PRIMARY);
28    DCHECK(U_SUCCESS(error_code_));
29  }
30
31  virtual ~ChromeStringCanonicalizer() {}
32
33  // StringCanonicalizer implementation.
34  virtual std::string CanonicalizeString(const std::string& original) {
35    // Returns a canonical version of the string that can be used for comparing
36    // strings regardless of diacritics and capitalization.
37    //    CanonicalizeString("Texas") == CanonicalizeString("T\u00E9xas");
38    //    CanonicalizeString("Texas") == CanonicalizeString("teXas");
39    //    CanonicalizeString("Texas") != CanonicalizeString("California");
40    //
41    // The output is not human-readable.
42    //    CanonicalizeString("Texas") != "Texas";
43    icu::UnicodeString icu_str(
44        original.c_str(), static_cast<int32_t>(original.length()));
45    int32_t buffer_size = collator_->getSortKey(icu_str, NULL, 0);
46    scoped_ptr<uint8_t[]> buffer(new uint8_t[buffer_size]);
47    DCHECK(buffer.get());
48    int32_t filled_size =
49        collator_->getSortKey(icu_str, buffer.get(), buffer_size);
50    DCHECK_EQ(buffer_size, filled_size);
51    return std::string(reinterpret_cast<const char*>(buffer.get()));
52  }
53
54 private:
55  UErrorCode error_code_;
56  scoped_ptr<icu::Collator> collator_;
57
58  DISALLOW_COPY_AND_ASSIGN(ChromeStringCanonicalizer);
59};
60
61}  // namespace
62
63// static
64scoped_ptr<StringCanonicalizer> StringCanonicalizer::Build() {
65  return scoped_ptr<StringCanonicalizer>(new ChromeStringCanonicalizer);
66}
67
68}  // namespace addressinput
69}  // namespace i18n
70