17dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Copyright 2013 The Chromium Authors. All rights reserved.
27dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Use of this source code is governed by a BSD-style license that can be
37dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// found in the LICENSE file.
47dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
57dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "chrome/common/translate/language_detection_util.h"
67dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
77dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "base/strings/string16.h"
87dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "base/strings/utf_string_conversions.h"
97dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "chrome/common/chrome_constants.h"
107dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch#include "testing/gtest/include/gtest/gtest.h"
117dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
127dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdochtypedef testing::Test LanguageDetectionUtilTest;
137dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
147dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that well-known language code typos are fixed.
157dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) {
167dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string language;
177dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
187dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  // Strip the second and later codes.
197dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("ja,en");
207dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  LanguageDetectionUtil::CorrectLanguageCodeTypo(&language);
217dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("ja", language);
227dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
237dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  // Replace dash with hyphen.
247dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("ja_JP");
257dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  LanguageDetectionUtil::CorrectLanguageCodeTypo(&language);
267dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("ja-JP", language);
277dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
287dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  // Correct wrong cases.
297dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("JA-jp");
307dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  LanguageDetectionUtil::CorrectLanguageCodeTypo(&language);
317dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("ja-JP", language);
327dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
337dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
347dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests if the language codes' format is invalid.
357dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) {
367dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string language;
377dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
387dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("ja");
397dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
407dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
417dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("ja-JP");
427dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
437dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
447dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("ceb");
457dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
467dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
477dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("ceb-XX");
487dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language));
497dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
507dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  // Invalid because the sub code consists of a number.
517dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("utf-8");
527dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language));
537dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
547dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  // Invalid because of six characters after hyphen.
557dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("ja-YUKARI");
567dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language));
577dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
587dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  // Invalid because of four characters.
597dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  language = std::string("DHMO");
607dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language));
617dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
627dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
637dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that similar language table works.
647dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) {
657dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "en"));
667dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "ja"));
677dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hr"));
687dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("sr-ME", "sr"));
697dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("ne", "hi"));
707dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hi"));
717dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
727dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
737dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that well-known languages which often have wrong server configuration
747dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// are handles.
757dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) {
767dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", "ja"));
777dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en-US",
787dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch                                                                   "ja"));
797dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en",
807dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch                                                                   "zh-CN"));
817dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("ja",
827dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch                                                                    "en"));
837dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en",
847dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch                                                                    "he"));
857dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
867dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
877dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing wrong information is ignored by
887dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// LanguageDetectionUtil due to disagreement between meta tag and CLD.
897dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) {
907dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  base::string16 contents = ASCIIToUTF16(
917dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "<html><head><meta http-equiv='Content-Language' content='ja'></head>"
927dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "<body>This is a page apparently written in English. Even though "
937dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "content-language is provided, the value will be ignored if the value "
947dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "is suspicious.</body></html>");
957dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string cld_language;
967dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  bool is_cld_reliable;
977dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
987dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      std::string("ja"), std::string(), contents, &cld_language,
997dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      &is_cld_reliable);
1007dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ(chrome::kUnknownLanguageCode, language);
1017dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("en", cld_language);
1027dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(is_cld_reliable);
1037dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
1047dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
1057dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing "en-US" style information is
1067dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// agreed by CLD.
1077dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) {
1087dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  base::string16 contents = ASCIIToUTF16(
1097dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "<html><head><meta http-equiv='Content-Language' content='en-US'></head>"
1107dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "<body>This is a page apparently written in English. Even though "
1117dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "content-language is provided, the value will be ignored if the value "
1127dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "is suspicious.</body></html>");
1137dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string cld_language;
1147dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  bool is_cld_reliable;
1157dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
1167dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      std::string("en-US"), std::string(), contents, &cld_language,
1177dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      &is_cld_reliable);
1187dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("en-US", language);
1197dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("en", cld_language);
1207dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(is_cld_reliable);
1217dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
1227dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
1237dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing wrong information is ignored and
1247dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// CLD's language will be adopted by LanguageDetectionUtil due to an invalid
1257dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// meta tag.
1267dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) {
1277dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  base::string16 contents = ASCIIToUTF16(
1287dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>"
1297dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "<body>This is a page apparently written in English. Even though "
1307dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "content-language is provided, the value will be ignored and CLD's"
1317dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      " language will be adopted if the value is invalid.</body></html>");
1327dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string cld_language;
1337dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  bool is_cld_reliable;
1347dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
1357dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      std::string("utf-8"), std::string(), contents, &cld_language,
1367dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      &is_cld_reliable);
1377dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("en", language);
1387dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("en", cld_language);
1397dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(is_cld_reliable);
1407dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
1417dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch
1427dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// Tests that the language meta tag providing wrong information is ignored
1437dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch// because of valid html lang attribute.
1447dbb3d5cf0c15f500944d211057644d6a2f37371Ben MurdochTEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) {
1457dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  base::string16 contents = ASCIIToUTF16(
1467dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>"
1477dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "</head><body>This is a page apparently written in English. Even though "
1487dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "content-language is provided, the value will be ignored if the value "
1497dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      "is suspicious.</body></html>");
1507dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string cld_language;
1517dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  bool is_cld_reliable;
1527dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  std::string language = LanguageDetectionUtil::DeterminePageLanguage(
1537dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      std::string("ja"), std::string("en"), contents, &cld_language,
1547dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch      &is_cld_reliable);
1557dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("en", language);
1567dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_EQ("en", cld_language);
1577dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch  EXPECT_TRUE(is_cld_reliable);
1587dbb3d5cf0c15f500944d211057644d6a2f37371Ben Murdoch}
159