1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/common/translate/language_detection_util.h"
6
7#include "base/strings/string16.h"
8#include "base/strings/utf_string_conversions.h"
9#include "chrome/common/chrome_constants.h"
10#include "testing/gtest/include/gtest/gtest.h"
11
12typedef testing::Test LanguageDetectionUtilTest;
13
14// Tests that well-known language code typos are fixed.
15TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) {
16  std::string language;
17
18  // Strip the second and later codes.
19  language = std::string("ja,en");
20  LanguageDetectionUtil::CorrectLanguageCodeTypo(&language);
21  EXPECT_EQ("ja", language);
22
23  // Replace dash with hyphen.
24  language = std::string("ja_JP");
25  LanguageDetectionUtil::CorrectLanguageCodeTypo(&language);
26  EXPECT_EQ("ja-JP", language);
27
28  // Correct wrong cases.
29  language = std::string("JA-jp");
30  LanguageDetectionUtil::CorrectLanguageCodeTypo(&language);
31  EXPECT_EQ("ja-JP", language);
32}
33
34// Tests if the language codes' format is invalid.
35TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) {
36  std::string language;
37
38  language = std::string("ja");
39  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
40
41  language = std::string("ja-JP");
42  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
43
44  language = std::string("ceb");
45  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
46
47  language = std::string("ceb-XX");
48  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
49
50  // Invalid because the sub code consists of a number.
51  language = std::string("utf-8");
52  EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language));
53
54  // Invalid because of six characters after hyphen.
55  language = std::string("ja-YUKARI");
56  EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language));
57
58  // Invalid because of four characters.
59  language = std::string("DHMO");
60  EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language));
61}
62
63// Tests that similar language table works.
64TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) {
65  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "en"));
66  EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "ja"));
67  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hr"));
68  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("sr-ME", "sr"));
69  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("ne", "hi"));
70  EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hi"));
71}
72
73// Tests that well-known languages which often have wrong server configuration
74// are handles.
75TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) {
76  EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", "ja"));
77  EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en-US",
78                                                                   "ja"));
79  EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en",
80                                                                   "zh-CN"));
81  EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("ja",
82                                                                    "en"));
83  EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en",
84                                                                    "he"));
85}
86
87// Tests that the language meta tag providing wrong information is ignored by
88// LanguageDetectionUtil due to disagreement between meta tag and CLD.
89TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) {
90  base::string16 contents = ASCIIToUTF16(
91      "<html><head><meta http-equiv='Content-Language' content='ja'></head>"
92      "<body>This is a page apparently written in English. Even though "
93      "content-language is provided, the value will be ignored if the value "
94      "is suspicious.</body></html>");
95  std::string cld_language;
96  bool is_cld_reliable;
97  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
98      std::string("ja"), std::string(), contents, &cld_language,
99      &is_cld_reliable);
100  EXPECT_EQ(chrome::kUnknownLanguageCode, language);
101  EXPECT_EQ("en", cld_language);
102  EXPECT_TRUE(is_cld_reliable);
103}
104
105// Tests that the language meta tag providing "en-US" style information is
106// agreed by CLD.
107TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) {
108  base::string16 contents = ASCIIToUTF16(
109      "<html><head><meta http-equiv='Content-Language' content='en-US'></head>"
110      "<body>This is a page apparently written in English. Even though "
111      "content-language is provided, the value will be ignored if the value "
112      "is suspicious.</body></html>");
113  std::string cld_language;
114  bool is_cld_reliable;
115  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
116      std::string("en-US"), std::string(), contents, &cld_language,
117      &is_cld_reliable);
118  EXPECT_EQ("en-US", language);
119  EXPECT_EQ("en", cld_language);
120  EXPECT_TRUE(is_cld_reliable);
121}
122
123// Tests that the language meta tag providing wrong information is ignored and
124// CLD's language will be adopted by LanguageDetectionUtil due to an invalid
125// meta tag.
126TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) {
127  base::string16 contents = ASCIIToUTF16(
128      "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>"
129      "<body>This is a page apparently written in English. Even though "
130      "content-language is provided, the value will be ignored and CLD's"
131      " language will be adopted if the value is invalid.</body></html>");
132  std::string cld_language;
133  bool is_cld_reliable;
134  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
135      std::string("utf-8"), std::string(), contents, &cld_language,
136      &is_cld_reliable);
137  EXPECT_EQ("en", language);
138  EXPECT_EQ("en", cld_language);
139  EXPECT_TRUE(is_cld_reliable);
140}
141
142// Tests that the language meta tag providing wrong information is ignored
143// because of valid html lang attribute.
144TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) {
145  base::string16 contents = ASCIIToUTF16(
146      "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>"
147      "</head><body>This is a page apparently written in English. Even though "
148      "content-language is provided, the value will be ignored if the value "
149      "is suspicious.</body></html>");
150  std::string cld_language;
151  bool is_cld_reliable;
152  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
153      std::string("ja"), std::string("en"), contents, &cld_language,
154      &is_cld_reliable);
155  EXPECT_EQ("en", language);
156  EXPECT_EQ("en", cld_language);
157  EXPECT_TRUE(is_cld_reliable);
158}
159