1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "components/translate/language_detection/language_detection_util.h" 6 7#include "base/strings/string16.h" 8#include "base/strings/utf_string_conversions.h" 9#include "components/translate/common/translate_constants.h" 10#include "testing/gtest/include/gtest/gtest.h" 11 12typedef testing::Test LanguageDetectionUtilTest; 13 14// Tests that well-known language code typos are fixed. 15TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) { 16 std::string language; 17 18 // Strip the second and later codes. 19 language = std::string("ja,en"); 20 translate::CorrectLanguageCodeTypo(&language); 21 EXPECT_EQ("ja", language); 22 23 // Replace dash with hyphen. 24 language = std::string("ja_JP"); 25 translate::CorrectLanguageCodeTypo(&language); 26 EXPECT_EQ("ja-JP", language); 27 28 // Correct wrong cases. 29 language = std::string("JA-jp"); 30 translate::CorrectLanguageCodeTypo(&language); 31 EXPECT_EQ("ja-JP", language); 32} 33 34// Tests if the language codes' format is invalid. 35TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) { 36 std::string language; 37 38 language = std::string("ja"); 39 EXPECT_TRUE(translate::IsValidLanguageCode(language)); 40 41 language = std::string("ja-JP"); 42 EXPECT_TRUE(translate::IsValidLanguageCode(language)); 43 44 language = std::string("ceb"); 45 EXPECT_TRUE(translate::IsValidLanguageCode(language)); 46 47 language = std::string("ceb-XX"); 48 EXPECT_TRUE(translate::IsValidLanguageCode(language)); 49 50 // Invalid because the sub code consists of a number. 51 language = std::string("utf-8"); 52 EXPECT_FALSE(translate::IsValidLanguageCode(language)); 53 54 // Invalid because of six characters after hyphen. 55 language = std::string("ja-YUKARI"); 56 EXPECT_FALSE(translate::IsValidLanguageCode(language)); 57 58 // Invalid because of four characters. 59 language = std::string("DHMO"); 60 EXPECT_FALSE(translate::IsValidLanguageCode(language)); 61} 62 63// Tests that similar language table works. 64TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) { 65 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("en", "en")); 66 EXPECT_FALSE(translate::IsSameOrSimilarLanguages("en", "ja")); 67 68 // Language codes are same if the main parts are same. The synonyms should be 69 // took into account (ex: 'iw' and 'he'). 70 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr-ME", "sr")); 71 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr", "sr-ME")); 72 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("he", "he-IL")); 73 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng", "eng-US")); 74 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng-US", "eng")); 75 EXPECT_FALSE(translate::IsSameOrSimilarLanguages("eng", "enm")); 76 77 // Even though the main parts are different, some special language pairs are 78 // recognized as same languages. 79 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("bs", "hr")); 80 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("ne", "hi")); 81 EXPECT_FALSE(translate::IsSameOrSimilarLanguages("bs", "hi")); 82} 83 84// Tests that well-known languages which often have wrong server configuration 85// are handles. 86TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) { 87 EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "ja")); 88 EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en-US", "ja")); 89 EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "zh-CN")); 90 EXPECT_FALSE(translate::MaybeServerWrongConfiguration("ja", "en")); 91 EXPECT_FALSE(translate::MaybeServerWrongConfiguration("en", "he")); 92} 93 94// Tests that the language meta tag providing wrong information is ignored by 95// LanguageDetectionUtil due to disagreement between meta tag and CLD. 96TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) { 97 base::string16 contents = ASCIIToUTF16( 98 "<html><head><meta http-equiv='Content-Language' content='ja'></head>" 99 "<body>This is a page apparently written in English. Even though " 100 "content-language is provided, the value will be ignored if the value " 101 "is suspicious.</body></html>"); 102 std::string cld_language; 103 bool is_cld_reliable; 104 std::string language = translate::DeterminePageLanguage(std::string("ja"), 105 std::string(), 106 contents, 107 &cld_language, 108 &is_cld_reliable); 109 EXPECT_EQ(translate::kUnknownLanguageCode, language); 110 EXPECT_EQ("en", cld_language); 111 EXPECT_TRUE(is_cld_reliable); 112} 113 114// Tests that the language meta tag providing "en-US" style information is 115// agreed by CLD. 116TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) { 117 base::string16 contents = ASCIIToUTF16( 118 "<html><head><meta http-equiv='Content-Language' content='en-US'></head>" 119 "<body>This is a page apparently written in English. Even though " 120 "content-language is provided, the value will be ignored if the value " 121 "is suspicious.</body></html>"); 122 std::string cld_language; 123 bool is_cld_reliable; 124 std::string language = translate::DeterminePageLanguage(std::string("en-US"), 125 std::string(), 126 contents, 127 &cld_language, 128 &is_cld_reliable); 129 EXPECT_EQ("en-US", language); 130 EXPECT_EQ("en", cld_language); 131 EXPECT_TRUE(is_cld_reliable); 132} 133 134// Tests that the language meta tag providing wrong information is ignored and 135// CLD's language will be adopted by LanguageDetectionUtil due to an invalid 136// meta tag. 137TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) { 138 base::string16 contents = ASCIIToUTF16( 139 "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>" 140 "<body>This is a page apparently written in English. Even though " 141 "content-language is provided, the value will be ignored and CLD's" 142 " language will be adopted if the value is invalid.</body></html>"); 143 std::string cld_language; 144 bool is_cld_reliable; 145 std::string language = translate::DeterminePageLanguage(std::string("utf-8"), 146 std::string(), 147 contents, 148 &cld_language, 149 &is_cld_reliable); 150 EXPECT_EQ("en", language); 151 EXPECT_EQ("en", cld_language); 152 EXPECT_TRUE(is_cld_reliable); 153} 154 155// Tests that the language meta tag providing wrong information is ignored 156// because of valid html lang attribute. 157TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) { 158 base::string16 contents = ASCIIToUTF16( 159 "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>" 160 "</head><body>This is a page apparently written in English. Even though " 161 "content-language is provided, the value will be ignored if the value " 162 "is suspicious.</body></html>"); 163 std::string cld_language; 164 bool is_cld_reliable; 165 std::string language = translate::DeterminePageLanguage(std::string("ja"), 166 std::string("en"), 167 contents, 168 &cld_language, 169 &is_cld_reliable); 170 EXPECT_EQ("en", language); 171 EXPECT_EQ("en", cld_language); 172 EXPECT_TRUE(is_cld_reliable); 173} 174