1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Copyright (C) 2005-2009, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ucsdet.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csdetect.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csmatch.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uenumimp.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "umutex.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "ucln_in.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uarrsort.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "inputext.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrsbcs.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrmbcs.h" 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrutf8.h" 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrucode.h" 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csr2022.h" 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DELETE_ARRAY(array) uprv_free((void *) (array)) 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t fCSRecognizers_size = 0; 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UBool U_CALLCONV csdet_cleanup(void) 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers != NULL) { 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fCSRecognizers[r]; 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers[r] = NULL; 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(fCSRecognizers); 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers = NULL; 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers_size = 0; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerucharsetMatchComparator(const void * /*context*/, const void *left, const void *right) 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_NAMESPACE_USE 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const CharsetMatch **csm_l = (const CharsetMatch **) left; 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const CharsetMatch **csm_r = (const CharsetMatch **) right; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // NOTE: compare is backwards to sort from highest to lowest. 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setRecognizers(UErrorCode &status) 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool needsInit; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer **recognizers; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (needsInit) { 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer *tempArray[] = { 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF8(), 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_16_BE(), 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_16_LE(), 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_32_BE(), 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_32_LE(), 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_en(), 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_da(), 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_de(), 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_es(), 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_fr(), 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_it(), 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_nl(), 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_no(), 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_pt(), 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_sv(), 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_cs(), 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_hu(), 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_pl(), 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_ro(), 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_5_ru(), 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_6_ar(), 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_7_el(), 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_8_I_he(), 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_8_he(), 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_windows_1251(), 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_windows_1256(), 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_KOI8_R(), 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_9_tr(), 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_sjis(), 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_gb_18030(), 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_euc_jp(), 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_euc_kr(), 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_big5(), 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_2022JP(), 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_2022KR(), 12385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_2022CN(), 12485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho 12585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM424_he_rtl(), 12685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM424_he_ltr(), 12785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM420_ar_rtl(), 12885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM420_ar_ltr() 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rCount = ARRAY_SIZE(tempArray); 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t r; 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (recognizers == NULL) { 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 13785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho return; 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (r = 0; r < rCount; r += 1) { 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers[r] = tempArray[r]; 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (recognizers[r] == NULL) { 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_SUCCESS(status)) { 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru umtx_lock(NULL); 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers == NULL) { 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers_size = rCount; 15385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho fCSRecognizers = recognizers; 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru umtx_unlock(NULL); 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers != recognizers) { 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (r = 0; r < rCount; r += 1) { 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete recognizers[r]; 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers[r] = NULL; 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(recognizers); 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers = NULL; 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::CharsetDetector(UErrorCode &status) 17385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho : textIn(new InputText(status)), resultArray(NULL), 17485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setRecognizers(status); 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (resultArray == NULL) { 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray[i] = new CharsetMatch(); 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (resultArray[i] == NULL) { 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::~CharsetDetector() 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete textIn; 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete resultArray[i]; 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(resultArray); 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setText(const char *in, int32_t len) 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->setText(in, len); 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = TRUE; 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::setStripTagsFlag(UBool flag) 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool temp = fStripTags; 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fStripTags = flag; 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = TRUE; 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return temp; 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::getStripTagsFlag() const 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fStripTags; 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->setDeclaredEncoding(encoding,len); 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t CharsetDetector::getDetectableCount() 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setRecognizers(status); 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers_size; 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch *CharsetDetector::detect(UErrorCode &status) 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxMatchesFound = 0; 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru detectAll(maxMatchesFound, status); 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxMatchesFound > 0) { 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return resultArray[0]; 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!textIn->isSet()) { 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(fFreshTextSet) { 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer *csr; 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t detectResults; 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t confidence; 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i; 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->MungeInput(fStripTags); 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Iterate over all possible charsets, remember all that 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // give a match quality > 0. 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultCount = 0; 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i = 0; i < fCSRecognizers_size; i += 1) { 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru csr = fCSRecognizers[i]; 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru detectResults = csr->match(textIn); 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru confidence = detectResults; 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (confidence > 0) { 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray[resultCount++]->set(textIn, csr, confidence); 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i = resultCount; i < fCSRecognizers_size; i += 1) { 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray[i]->set(textIn, 0, 0); 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 29285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho 29385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // Remove duplicate charsets from the results. 29485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // Simple minded, brute force approach - check each entry against all that follow. 29585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // The first entry of any duplicated set is the one that should be kept because it will 29685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // be the one with the highest confidence rating. 29785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // (Duplicate matches have different languages, only the charset is the same) 29885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually 29985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // deleted, just reordered, with the unwanted duplicates placed after the good results. 30085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho int32_t j, k; 30185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho for (i=0; i<resultCount; i++) { 30285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho const char *charSetName = resultArray[i]->getName(); 30385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho for (j=i+1; j<resultCount; ) { 30485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) { 30585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // Not a duplicate. 30685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho j++; 30785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } else { 30885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // Duplicate entry at index j. 30985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho CharsetMatch *duplicate = resultArray[j]; 31085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho for (k=j; k<resultCount-1; k++) { 31185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho resultArray[k] = resultArray[k+1]; 31285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 31385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho resultCount--; 31485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho resultArray[resultCount] = duplicate; 31585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 31685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 31785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = FALSE; 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxMatchesFound = resultCount; 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return resultArray; 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( index > fCSRecognizers_size-1 || index < 0) { 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_INDEX_OUTOFBOUNDS_ERROR; 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers[index]->getName(); 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/ 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querutypedef struct { 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t currIndex; 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} Context; 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumClose(UEnumeration *en) { 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(en->context != NULL) { 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(en->context); 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(en); 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumCount(UEnumeration *, UErrorCode *) { 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers_size; 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const char* U_CALLCONV 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(resultLength != NULL) { 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *resultLength = 0; 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(resultLength != NULL) { 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *resultLength = (int32_t)uprv_strlen(currName); 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((Context *)en->context)->currIndex++; 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return currName; 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumReset(UEnumeration *en, UErrorCode *) { 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((Context *)en->context)->currIndex = 0; 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const UEnumeration gCSDetEnumeration = { 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NULL, 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NULL, 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumClose, 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumCount, 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uenum_unextDefault, 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumNext, 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumReset 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UEnumeration * U_EXPORT2 39485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Houcsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_NAMESPACE_USE 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* Initialize recognized charsets. */ 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector::getDetectableCount(); 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru en->context = (void*)NEW_ARRAY(Context, 1); 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(en->context, 0, sizeof(Context)); 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return en; 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 415