17dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Copyright 2013 The Chromium Authors. All rights reserved. 27dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Use of this source code is governed by a BSD-style license that can be 37dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// found in the LICENSE file. 47dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 57dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "chrome/common/translate/language_detection_util.h" 67dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 77dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "base/strings/string16.h" 87dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "base/strings/utf_string_conversions.h" 97dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "chrome/common/chrome_constants.h" 107dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "testing/gtest/include/gtest/gtest.h" 117dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 127dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdochtypedef testing::Test LanguageDetectionUtilTest; 137dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 147dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that well-known language code typos are fixed. 157dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) { 167dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string language; 177dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 187dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch // Strip the second and later codes. 197dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("ja,en"); 207dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); 217dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("ja", language); 227dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 237dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch // Replace dash with hyphen. 247dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("ja_JP"); 257dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); 267dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("ja-JP", language); 277dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 287dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch // Correct wrong cases. 297dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("JA-jp"); 307dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); 317dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("ja-JP", language); 327dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 337dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 347dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests if the language codes' format is invalid. 357dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) { 367dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string language; 377dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 387dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("ja"); 397dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 407dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 417dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("ja-JP"); 427dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 437dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 447dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("ceb"); 457dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 467dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 477dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("ceb-XX"); 487dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 497dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 507dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch // Invalid because the sub code consists of a number. 517dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("utf-8"); 527dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); 537dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 547dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch // Invalid because of six characters after hyphen. 557dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("ja-YUKARI"); 567dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); 577dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 587dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch // Invalid because of four characters. 597dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch language = std::string("DHMO"); 607dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); 617dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 627dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 637dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that similar language table works. 647dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) { 657dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "en")); 667dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "ja")); 677dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hr")); 687dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("sr-ME", "sr")); 697dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("ne", "hi")); 707dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hi")); 717dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 727dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 737dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that well-known languages which often have wrong server configuration 747dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// are handles. 757dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) { 767dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", "ja")); 777dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en-US", 787dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "ja")); 797dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", 807dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "zh-CN")); 817dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("ja", 827dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "en")); 837dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", 847dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "he")); 857dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 867dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 877dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing wrong information is ignored by 887dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// LanguageDetectionUtil due to disagreement between meta tag and CLD. 897dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) { 907dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch base::string16 contents = ASCIIToUTF16( 917dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "<html><head><meta http-equiv='Content-Language' content='ja'></head>" 927dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "<body>This is a page apparently written in English. Even though " 937dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "content-language is provided, the value will be ignored if the value " 947dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "is suspicious.</body></html>"); 957dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string cld_language; 967dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch bool is_cld_reliable; 977dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string language = LanguageDetectionUtil::DeterminePageLanguage( 987dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string("ja"), std::string(), contents, &cld_language, 997dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch &is_cld_reliable); 1007dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ(chrome::kUnknownLanguageCode, language); 1017dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("en", cld_language); 1027dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(is_cld_reliable); 1037dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 1047dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 1057dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing "en-US" style information is 1067dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// agreed by CLD. 1077dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) { 1087dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch base::string16 contents = ASCIIToUTF16( 1097dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "<html><head><meta http-equiv='Content-Language' content='en-US'></head>" 1107dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "<body>This is a page apparently written in English. Even though " 1117dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "content-language is provided, the value will be ignored if the value " 1127dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "is suspicious.</body></html>"); 1137dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string cld_language; 1147dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch bool is_cld_reliable; 1157dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string language = LanguageDetectionUtil::DeterminePageLanguage( 1167dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string("en-US"), std::string(), contents, &cld_language, 1177dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch &is_cld_reliable); 1187dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("en-US", language); 1197dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("en", cld_language); 1207dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(is_cld_reliable); 1217dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 1227dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 1237dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing wrong information is ignored and 1247dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// CLD's language will be adopted by LanguageDetectionUtil due to an invalid 1257dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// meta tag. 1267dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) { 1277dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch base::string16 contents = ASCIIToUTF16( 1287dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>" 1297dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "<body>This is a page apparently written in English. Even though " 1307dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "content-language is provided, the value will be ignored and CLD's" 1317dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch " language will be adopted if the value is invalid.</body></html>"); 1327dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string cld_language; 1337dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch bool is_cld_reliable; 1347dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string language = LanguageDetectionUtil::DeterminePageLanguage( 1357dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string("utf-8"), std::string(), contents, &cld_language, 1367dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch &is_cld_reliable); 1377dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("en", language); 1387dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("en", cld_language); 1397dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(is_cld_reliable); 1407dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 1417dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch 1427dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing wrong information is ignored 1437dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// because of valid html lang attribute. 1447dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) { 1457dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch base::string16 contents = ASCIIToUTF16( 1467dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>" 1477dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "</head><body>This is a page apparently written in English. Even though " 1487dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "content-language is provided, the value will be ignored if the value " 1497dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch "is suspicious.</body></html>"); 1507dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string cld_language; 1517dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch bool is_cld_reliable; 1527dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string language = LanguageDetectionUtil::DeterminePageLanguage( 1537dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch std::string("ja"), std::string("en"), contents, &cld_language, 1547dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch &is_cld_reliable); 1557dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("en", language); 1567dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_EQ("en", cld_language); 1577dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch EXPECT_TRUE(is_cld_reliable); 1587dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch} 159