cld_unicodetext.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_unicodetext.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> // to compile bar/common/component.h 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/compact_lang_det.h" 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/string_byte_sink.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/string_util.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/normlzr.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/unistr.h" 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "unicode/ustring.h" 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)std::string NormalizeText(const UChar* text) { 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // To avoid a copy, use the read-only aliasing ctor. 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) icu::UnicodeString source(1, text, -1); 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) icu::UnicodeString normalized; 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status); 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (U_FAILURE(status)) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return std::string(); 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) normalized.toLower(); 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string utf8; 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Internally, toUTF8 uses a 1kB stack buffer (which is not large enough 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // for most web pages) and does pre-flighting followed by malloc for larger 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // strings. We have to switch to obtaining the buffer with the maximum size 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (UTF-16 length * 3) without pre-flighting if necessary. 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringByteSink sink(&utf8); 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) normalized.toUTF8(sink); 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return utf8; 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Detects a language of the UTF-16 encoded zero-terminated text. 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns: Language enum. 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Language DetectLanguageOfUnicodeText( 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const CompactLangDet::DetectionTables* detection_tables, 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const UChar* text, bool is_plain_text, 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool* is_reliable, int* num_languages, 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* error_code, int* text_bytes) { 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!text || !num_languages) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NUM_LANGUAGES; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Normalize text to NFC, lowercase and convert to UTF-8. 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string utf8_encoded = NormalizeText(text); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (utf8_encoded.empty()) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NUM_LANGUAGES; 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Engage core CLD library language detection. 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Language language3[3] = { 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int percent3[3] = { 0, 0, 0 }; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int text_bytes_tmp = 0; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We ignore return value here due to the problem described in bug 1800161. 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For example, translate.google.com was detected as Indonesian. It happened 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // due to the heuristic in CLD, which ignores English as a top language 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the presence of another reliably detected language. 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function. 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // language3 array is always set according to the detection results and 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is not affected by this heuristic. 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CompactLangDet::DetectLanguageSummary(detection_tables, 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf8_encoded.c_str(), 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf8_encoded.length(), 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) is_plain_text, language3, percent3, 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &text_bytes_tmp, is_reliable); 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Calcualte a number of languages detected in more than 20% of the text. 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int kMinTextPercentToCountLanguage = 20; 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *num_languages = 0; 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (text_bytes) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *text_bytes = text_bytes_tmp; 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) COMPILE_ASSERT(arraysize(language3) == arraysize(percent3), 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) language3_and_percent3_should_be_of_the_same_size); 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < arraysize(language3); ++i) { 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) && 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) percent3[i] >= kMinTextPercentToCountLanguage) { 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++*num_languages; 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return language3[0]; 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 86