cld_unicodetext.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_unicodetext.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>  // to compile bar/common/component.h
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/compact_lang_det.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/string_byte_sink.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/string_util.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/normlzr.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/unistr.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/ustring.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)std::string NormalizeText(const UChar* text) {
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // To avoid a copy, use the read-only aliasing ctor.
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::UnicodeString source(1, text, -1);
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::UnicodeString normalized;
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_FAILURE(status))
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return std::string();
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  normalized.toLower();
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string utf8;
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Internally, toUTF8 uses a 1kB stack buffer (which is not large enough
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // for most web pages) and does pre-flighting followed by malloc for larger
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // strings. We have to switch to obtaining the buffer with the maximum size
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (UTF-16 length * 3) without pre-flighting if necessary.
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringByteSink sink(&utf8);
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  normalized.toUTF8(sink);
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return utf8;
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Detects a language of the UTF-16 encoded zero-terminated text.
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns: Language enum.
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Language DetectLanguageOfUnicodeText(
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const CompactLangDet::DetectionTables* detection_tables,
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const UChar* text, bool is_plain_text,
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    bool* is_reliable, int* num_languages,
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int* error_code, int* text_bytes) {
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!text || !num_languages)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NUM_LANGUAGES;
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Normalize text to NFC, lowercase and convert to UTF-8.
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string utf8_encoded = NormalizeText(text);
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (utf8_encoded.empty())
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NUM_LANGUAGES;
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Engage core CLD library language detection.
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Language language3[3] = {
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int percent3[3] = { 0, 0, 0 };
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int text_bytes_tmp = 0;
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We ignore return value here due to the problem described in bug 1800161.
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // For example, translate.google.com was detected as Indonesian.  It happened
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // due to the heuristic in CLD, which ignores English as a top language
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in the presence of another reliably detected language.
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // language3 array is always set according to the detection results and
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is not affected by this heuristic.
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CompactLangDet::DetectLanguageSummary(detection_tables,
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                        utf8_encoded.c_str(),
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                        utf8_encoded.length(),
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                        is_plain_text, language3, percent3,
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                        &text_bytes_tmp, is_reliable);
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Calcualte a number of languages detected in more than 20% of the text.
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kMinTextPercentToCountLanguage = 20;
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *num_languages = 0;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (text_bytes)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *text_bytes = text_bytes_tmp;
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 language3_and_percent3_should_be_of_the_same_size);
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < arraysize(language3); ++i) {
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        percent3[i] >= kMinTextPercentToCountLanguage) {
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ++*num_languages;
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return language3[0];
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
86