1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/translate/core/language_detection/language_detection_util.h"
6
7#include "base/strings/string16.h"
8#include "base/strings/utf_string_conversions.h"
9#include "components/translate/core/common/translate_constants.h"
10#include "testing/gtest/include/gtest/gtest.h"
11
12typedef testing::Test LanguageDetectionUtilTest;
13
14// Tests that well-known language code typos are fixed.
15TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) {
16  std::string language;
17
18  // Strip the second and later codes.
19  language = std::string("ja,en");
20  translate::CorrectLanguageCodeTypo(&language);
21  EXPECT_EQ("ja", language);
22
23  // Replace dash with hyphen.
24  language = std::string("ja_JP");
25  translate::CorrectLanguageCodeTypo(&language);
26  EXPECT_EQ("ja-JP", language);
27
28  // Correct wrong cases.
29  language = std::string("JA-jp");
30  translate::CorrectLanguageCodeTypo(&language);
31  EXPECT_EQ("ja-JP", language);
32}
33
34// Tests if the language codes' format is invalid.
35TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) {
36  std::string language;
37
38  language = std::string("ja");
39  EXPECT_TRUE(translate::IsValidLanguageCode(language));
40
41  language = std::string("ja-JP");
42  EXPECT_TRUE(translate::IsValidLanguageCode(language));
43
44  language = std::string("ceb");
45  EXPECT_TRUE(translate::IsValidLanguageCode(language));
46
47  language = std::string("ceb-XX");
48  EXPECT_TRUE(translate::IsValidLanguageCode(language));
49
50  // Invalid because the sub code consists of a number.
51  language = std::string("utf-8");
52  EXPECT_FALSE(translate::IsValidLanguageCode(language));
53
54  // Invalid because of six characters after hyphen.
55  language = std::string("ja-YUKARI");
56  EXPECT_FALSE(translate::IsValidLanguageCode(language));
57
58  // Invalid because of four characters.
59  language = std::string("DHMO");
60  EXPECT_FALSE(translate::IsValidLanguageCode(language));
61}
62
63// Tests that similar language table works.
64TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) {
65  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("en", "en"));
66  EXPECT_FALSE(translate::IsSameOrSimilarLanguages("en", "ja"));
67
68  // Language codes are same if the main parts are same. The synonyms should be
69  // took into account (ex: 'iw' and 'he').
70  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr-ME", "sr"));
71  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr", "sr-ME"));
72  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("he", "he-IL"));
73  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng", "eng-US"));
74  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng-US", "eng"));
75  EXPECT_FALSE(translate::IsSameOrSimilarLanguages("eng", "enm"));
76
77  // Even though the main parts are different, some special language pairs are
78  // recognized as same languages.
79  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("bs", "hr"));
80  EXPECT_TRUE(translate::IsSameOrSimilarLanguages("ne", "hi"));
81  EXPECT_FALSE(translate::IsSameOrSimilarLanguages("bs", "hi"));
82}
83
84// Tests that well-known languages which often have wrong server configuration
85// are handles.
86TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) {
87  EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "ja"));
88  EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en-US", "ja"));
89  EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "zh-CN"));
90  EXPECT_FALSE(translate::MaybeServerWrongConfiguration("ja", "en"));
91  EXPECT_FALSE(translate::MaybeServerWrongConfiguration("en", "he"));
92}
93
94// Tests that the language meta tag providing wrong information is ignored by
95// LanguageDetectionUtil due to disagreement between meta tag and CLD.
96TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) {
97  base::string16 contents = base::ASCIIToUTF16(
98      "<html><head><meta http-equiv='Content-Language' content='ja'></head>"
99      "<body>This is a page apparently written in English. Even though "
100      "content-language is provided, the value will be ignored if the value "
101      "is suspicious.</body></html>");
102  std::string cld_language;
103  bool is_cld_reliable;
104  std::string language = translate::DeterminePageLanguage(std::string("ja"),
105                                                          std::string(),
106                                                          contents,
107                                                          &cld_language,
108                                                          &is_cld_reliable);
109  EXPECT_EQ(translate::kUnknownLanguageCode, language);
110  EXPECT_EQ("en", cld_language);
111  EXPECT_TRUE(is_cld_reliable);
112}
113
114// Tests that the language meta tag providing "en-US" style information is
115// agreed by CLD.
116TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) {
117  base::string16 contents = base::ASCIIToUTF16(
118      "<html><head><meta http-equiv='Content-Language' content='en-US'></head>"
119      "<body>This is a page apparently written in English. Even though "
120      "content-language is provided, the value will be ignored if the value "
121      "is suspicious.</body></html>");
122  std::string cld_language;
123  bool is_cld_reliable;
124  std::string language = translate::DeterminePageLanguage(std::string("en-US"),
125                                                          std::string(),
126                                                          contents,
127                                                          &cld_language,
128                                                          &is_cld_reliable);
129  EXPECT_EQ("en-US", language);
130  EXPECT_EQ("en", cld_language);
131  EXPECT_TRUE(is_cld_reliable);
132}
133
134// Tests that the language meta tag providing wrong information is ignored and
135// CLD's language will be adopted by LanguageDetectionUtil due to an invalid
136// meta tag.
137TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) {
138  base::string16 contents = base::ASCIIToUTF16(
139      "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>"
140      "<body>This is a page apparently written in English. Even though "
141      "content-language is provided, the value will be ignored and CLD's"
142      " language will be adopted if the value is invalid.</body></html>");
143  std::string cld_language;
144  bool is_cld_reliable;
145  std::string language = translate::DeterminePageLanguage(std::string("utf-8"),
146                                                          std::string(),
147                                                          contents,
148                                                          &cld_language,
149                                                          &is_cld_reliable);
150  EXPECT_EQ("en", language);
151  EXPECT_EQ("en", cld_language);
152  EXPECT_TRUE(is_cld_reliable);
153}
154
155// Tests that the language meta tag providing wrong information is ignored
156// because of valid html lang attribute.
157TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) {
158  base::string16 contents = base::ASCIIToUTF16(
159      "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>"
160      "</head><body>This is a page apparently written in English. Even though "
161      "content-language is provided, the value will be ignored if the value "
162      "is suspicious.</body></html>");
163  std::string cld_language;
164  bool is_cld_reliable;
165  std::string language = translate::DeterminePageLanguage(std::string("ja"),
166                                                          std::string("en"),
167                                                          contents,
168                                                          &cld_language,
169                                                          &is_cld_reliable);
170  EXPECT_EQ("en", language);
171  EXPECT_EQ("en", cld_language);
172  EXPECT_TRUE(is_cld_reliable);
173}
174