1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/translate/language_detection/language_detection_util.h"
6
7#include "base/logging.h"
8#include "base/metrics/field_trial.h"
9#include "base/strings/string_split.h"
10#include "base/strings/string_util.h"
11#include "base/strings/utf_string_conversions.h"
12#include "base/time/time.h"
13#include "components/translate/common/translate_constants.h"
14#include "components/translate/common/translate_metrics.h"
15#include "components/translate/common/translate_util.h"
16
17#if !defined(CLD_VERSION) || CLD_VERSION==1
18#include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
19#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
20#endif
21
22#if !defined(CLD_VERSION) || CLD_VERSION==2
23#include "third_party/cld_2/src/public/compact_lang_det.h"
24#endif
25
26namespace {
27
// One row of the similar-language table: a language code together with the id
// of the group of languages that CLD finds hard to tell apart from it.
struct SimilarLanguageCode {
  const char* const code;  // Language code, e.g. "bs".
  int group;               // Id shared by every member of the same group.
};

// Languages that are very similar to each other and difficult for CLD to
// distinguish. Entries with the same group id are treated as similar. Group
// ids start at 1; GetSimilarLanguageGroupCode() returns 0 for codes that are
// not listed here.
const SimilarLanguageCode kSimilarLanguageCodes[] = {
  // Group 1: Bosnian and Croatian.
  {"bs", 1}, {"hr", 1},
  // Group 2: Hindi and Nepali.
  {"hi", 2}, {"ne", 2},
};
41
42// Checks |kSimilarLanguageCodes| and returns group code.
43int GetSimilarLanguageGroupCode(const std::string& language) {
44  for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
45    if (language.find(kSimilarLanguageCodes[i].code) != 0)
46      continue;
47    return kSimilarLanguageCodes[i].group;
48  }
49  return 0;
50}
51
// Well-known languages which often have wrong server configuration of
// Content-Language: en.
// Both the strings and the pointers stored in the array are const, so the
// table cannot be modified at run time.
// TODO(toyoshim): Remove these static tables and caller functions to
// translate/common, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};
59
60// Applies a series of language code modification in proper order.
61void ApplyLanguageCodeCorrection(std::string* code) {
62  // Correct well-known format errors.
63  translate::CorrectLanguageCodeTypo(code);
64
65  if (!translate::IsValidLanguageCode(*code)) {
66    *code = std::string();
67    return;
68  }
69
70  translate::ToTranslateLanguageSynonym(code);
71}
72
73int GetCLDMajorVersion() {
74#if !defined(CLD_VERSION)
75  std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
76  if (group_name == "CLD2")
77    return 2;
78  else
79    return 1;
80#else
81  return CLD_VERSION;
82#endif
83}
84
85// Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
86// failed.
87// |is_cld_reliable| will be set as true if CLD says the detection is reliable.
88std::string DetermineTextLanguage(const base::string16& text,
89                                  bool* is_cld_reliable) {
90  std::string language = translate::kUnknownLanguageCode;
91  int text_bytes = 0;
92  bool is_reliable = false;
93
94  // Language or CLD2::Language
95  int cld_language = 0;
96  bool is_valid_language = false;
97
98  switch (GetCLDMajorVersion()) {
99#if !defined(CLD_VERSION) || CLD_VERSION==1
100    case 1: {
101      int num_languages = 0;
102      cld_language =
103          DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
104                                      &num_languages, NULL, &text_bytes);
105      is_valid_language = cld_language != NUM_LANGUAGES &&
106          cld_language != UNKNOWN_LANGUAGE &&
107          cld_language != TG_UNKNOWN_LANGUAGE;
108      break;
109    }
110#endif
111#if !defined(CLD_VERSION) || CLD_VERSION==2
112    case 2: {
113      std::string utf8_text(UTF16ToUTF8(text));
114      CLD2::Language language3[3];
115      int percent3[3];
116      CLD2::DetectLanguageSummary(
117          utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,
118          &text_bytes, &is_reliable);
119      cld_language = language3[0];
120      is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
121          cld_language != CLD2::UNKNOWN_LANGUAGE &&
122          cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
123      break;
124    }
125#endif
126    default:
127      NOTREACHED();
128  }
129
130  if (is_cld_reliable != NULL)
131    *is_cld_reliable = is_reliable;
132
133  // We don't trust the result if the CLD reports that the detection is not
134  // reliable, or if the actual text used to detect the language was less than
135  // 100 bytes (short texts can often lead to wrong results).
136  // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
137  // the determined language code is correct with 50% confidence. Chrome should
138  // handle the real confidence value to judge.
139  if (is_reliable && text_bytes >= 100 && is_valid_language) {
140    // We should not use LanguageCode_ISO_639_1 because it does not cover all
141    // the languages CLD can detect. As a result, it'll return the invalid
142    // language code for tradtional Chinese among others.
143    // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
144    // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
145    // for Simplified Chinese.
146    switch (GetCLDMajorVersion()) {
147#if !defined(CLD_VERSION) || CLD_VERSION==1
148      case 1:
149        language =
150            LanguageCodeWithDialects(static_cast<Language>(cld_language));
151        break;
152#endif
153#if !defined(CLD_VERSION) || CLD_VERSION==2
154      case 2:
155        // (1) CLD2's LanguageCode returns general Chinese 'zh' for
156        // CLD2::CHINESE, but Translate server doesn't accept it. This is
157        // converted to 'zh-CN' in the same way as CLD1's
158        // LanguageCodeWithDialects.
159        //
160        // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
161        // CLD2::CHINESE_T. This is technically more precise for the language
162        // code of traditional Chinese, while Translate server hasn't accepted
163        // zh-Hant yet.
164        if (cld_language == CLD2::CHINESE) {
165          language = "zh-CN";
166        } else if (cld_language == CLD2::CHINESE_T) {
167          language = "zh-TW";
168        } else {
169          language =
170              CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
171        }
172        break;
173#endif
174      default:
175        NOTREACHED();
176    }
177  }
178  VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
179          << "\n*************************************\n";
180  return language;
181}
182
183// Checks if CLD can complement a sub code when the page language doesn't know
184// the sub code.
185bool CanCLDComplementSubCode(
186    const std::string& page_language, const std::string& cld_language) {
187  // Translate server cannot treat general Chinese. If Content-Language and
188  // CLD agree that the language is Chinese and Content-Language doesn't know
189  // which dialect is used, CLD language has priority.
190  // TODO(hajimehoshi): How about the other dialects like zh-MO?
191  return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
192}
193
194}  // namespace
195
196namespace translate {
197
198std::string DeterminePageLanguage(const std::string& code,
199                                  const std::string& html_lang,
200                                  const base::string16& contents,
201                                  std::string* cld_language_p,
202                                  bool* is_cld_reliable_p) {
203  base::TimeTicks begin_time = base::TimeTicks::Now();
204  bool is_cld_reliable;
205  std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
206  translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
207
208  if (cld_language_p != NULL)
209    *cld_language_p = cld_language;
210  if (is_cld_reliable_p != NULL)
211    *is_cld_reliable_p = is_cld_reliable;
212  translate::ToTranslateLanguageSynonym(&cld_language);
213
214  // Check if html lang attribute is valid.
215  std::string modified_html_lang;
216  if (!html_lang.empty()) {
217    modified_html_lang = html_lang;
218    ApplyLanguageCodeCorrection(&modified_html_lang);
219    translate::ReportHtmlLang(html_lang, modified_html_lang);
220    VLOG(9) << "html lang based language code: " << modified_html_lang;
221  }
222
223  // Check if Content-Language is valid.
224  std::string modified_code;
225  if (!code.empty()) {
226    modified_code = code;
227    ApplyLanguageCodeCorrection(&modified_code);
228    translate::ReportContentLanguage(code, modified_code);
229  }
230
231  // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
232  // |modified_code|.
233  std::string language = modified_html_lang.empty() ? modified_code :
234                                                      modified_html_lang;
235
236  // If |language| is empty, just use CLD result even though it might be
237  // translate::kUnknownLanguageCode.
238  if (language.empty()) {
239    translate::ReportLanguageVerification(
240        translate::LANGUAGE_VERIFICATION_CLD_ONLY);
241    return cld_language;
242  }
243
244  if (cld_language == kUnknownLanguageCode) {
245    translate::ReportLanguageVerification(
246        translate::LANGUAGE_VERIFICATION_UNKNOWN);
247    return language;
248  } else if (CanCLDComplementSubCode(language, cld_language)) {
249    translate::ReportLanguageVerification(
250        translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
251    return cld_language;
252  } else if (IsSameOrSimilarLanguages(language, cld_language)) {
253    translate::ReportLanguageVerification(
254        translate::LANGUAGE_VERIFICATION_CLD_AGREE);
255    return language;
256  } else if (MaybeServerWrongConfiguration(language, cld_language)) {
257    translate::ReportLanguageVerification(
258        translate::LANGUAGE_VERIFICATION_TRUST_CLD);
259    return cld_language;
260  } else {
261    translate::ReportLanguageVerification(
262        translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);
263    // Content-Language value might be wrong because CLD says that this page
264    // is written in another language with confidence.
265    // In this case, Chrome doesn't rely on any of the language codes, and
266    // gives up suggesting a translation.
267    return std::string(kUnknownLanguageCode);
268  }
269
270  return language;
271}
272
273void CorrectLanguageCodeTypo(std::string* code) {
274  DCHECK(code);
275
276  size_t coma_index = code->find(',');
277  if (coma_index != std::string::npos) {
278    // There are more than 1 language specified, just keep the first one.
279    *code = code->substr(0, coma_index);
280  }
281  TrimWhitespaceASCII(*code, TRIM_ALL, code);
282
283  // An underscore instead of a dash is a frequent mistake.
284  size_t underscore_index = code->find('_');
285  if (underscore_index != std::string::npos)
286    (*code)[underscore_index] = '-';
287
288  // Change everything up to a dash to lower-case and everything after to upper.
289  size_t dash_index = code->find('-');
290  if (dash_index != std::string::npos) {
291    *code = StringToLowerASCII(code->substr(0, dash_index)) +
292        StringToUpperASCII(code->substr(dash_index));
293  } else {
294    *code = StringToLowerASCII(*code);
295  }
296}
297
298bool IsValidLanguageCode(const std::string& code) {
299  // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
300  // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
301  std::vector<std::string> chunks;
302  base::SplitString(code, '-', &chunks);
303
304  if (chunks.size() < 1 || 2 < chunks.size())
305    return false;
306
307  const std::string& main_code = chunks[0];
308
309  if (main_code.size() < 1 || 3 < main_code.size())
310    return false;
311
312  for (std::string::const_iterator it = main_code.begin();
313       it != main_code.end(); ++it) {
314    if (!IsAsciiAlpha(*it))
315      return false;
316  }
317
318  if (chunks.size() == 1)
319    return true;
320
321  const std::string& sub_code = chunks[1];
322
323  if (sub_code.size() != 2)
324    return false;
325
326  for (std::string::const_iterator it = sub_code.begin();
327       it != sub_code.end(); ++it) {
328    if (!IsAsciiAlpha(*it))
329      return false;
330  }
331
332  return true;
333}
334
335bool IsSameOrSimilarLanguages(const std::string& page_language,
336                              const std::string& cld_language) {
337  std::vector<std::string> chunks;
338
339  base::SplitString(page_language, '-', &chunks);
340  if (chunks.size() == 0)
341    return false;
342  std::string page_language_main_part = chunks[0];
343
344  base::SplitString(cld_language, '-', &chunks);
345  if (chunks.size() == 0)
346    return false;
347  std::string cld_language_main_part = chunks[0];
348
349  // Language code part of |page_language| is matched to one of |cld_language|.
350  // Country code is ignored here.
351  if (page_language_main_part == cld_language_main_part) {
352    // Languages are matched strictly. Reports false to metrics, but returns
353    // true.
354    translate::ReportSimilarLanguageMatch(false);
355    return true;
356  }
357
358  // Check if |page_language| and |cld_language| are in the similar language
359  // list and belong to the same language group.
360  int page_code = GetSimilarLanguageGroupCode(page_language);
361  bool match = page_code != 0 &&
362               page_code == GetSimilarLanguageGroupCode(cld_language);
363
364  translate::ReportSimilarLanguageMatch(match);
365  return match;
366}
367
368bool MaybeServerWrongConfiguration(const std::string& page_language,
369                                   const std::string& cld_language) {
370  // If |page_language| is not "en-*", respect it and just return false here.
371  if (!StartsWithASCII(page_language, "en", false))
372    return false;
373
374  // A server provides a language meta information representing "en-*". But it
375  // might be just a default value due to missing user configuration.
376  // Let's trust |cld_language| if the determined language is not difficult to
377  // distinguish from English, and the language is one of well-known languages
378  // which often provide "en-*" meta information mistakenly.
379  for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
380    if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
381      return true;
382  }
383  return false;
384}
385
386}  // namespace translate
387