1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Copyright (C) 2005-2012, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ucsdet.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csdetect.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csmatch.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uenumimp.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "umutex.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "ucln_in.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uarrsort.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "inputext.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrsbcs.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrmbcs.h" 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrutf8.h" 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrucode.h" 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csr2022.h" 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DELETE_ARRAY(array) uprv_free((void *) (array)) 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 36103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliusstatic icu::CharsetRecognizer **fCSRecognizers = NULL; 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t fCSRecognizers_size = 0; 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UBool U_CALLCONV csdet_cleanup(void) 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers != NULL) { 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fCSRecognizers[r]; 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers[r] = NULL; 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(fCSRecognizers); 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers = NULL; 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers_size = 0; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerucharsetMatchComparator(const void * /*context*/, const void *left, const void *right) 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_NAMESPACE_USE 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const CharsetMatch **csm_l = (const CharsetMatch **) left; 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const CharsetMatch **csm_r = (const CharsetMatch **) right; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // NOTE: compare is backwards to sort from highest to lowest. 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setRecognizers(UErrorCode &status) 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool needsInit; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer **recognizers; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (needsInit) { 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer *tempArray[] = { 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF8(), 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_16_BE(), 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_16_LE(), 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_32_BE(), 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_32_LE(), 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 9254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius new CharsetRecog_8859_1(), 9354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius new CharsetRecog_8859_2(), 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_5_ru(), 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_6_ar(), 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_7_el(), 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_8_I_he(), 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_8_he(), 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_windows_1251(), 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_windows_1256(), 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_KOI8_R(), 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_9_tr(), 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_sjis(), 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_gb_18030(), 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_euc_jp(), 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_euc_kr(), 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_big5(), 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_2022JP(), 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_2022KR(), 11185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_2022CN(), 11285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho 11385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM424_he_rtl(), 11485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM424_he_ltr(), 11585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM420_ar_rtl(), 11685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho new CharsetRecog_IBM420_ar_ltr() 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rCount = ARRAY_SIZE(tempArray); 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t r; 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (recognizers == NULL) { 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 12585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho return; 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (r = 0; r < rCount; r += 1) { 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers[r] = tempArray[r]; 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (recognizers[r] == NULL) { 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_SUCCESS(status)) { 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru umtx_lock(NULL); 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers == NULL) { 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers_size = rCount; 14185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho fCSRecognizers = recognizers; 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru umtx_unlock(NULL); 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers != recognizers) { 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (r = 0; r < rCount; r += 1) { 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete recognizers[r]; 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers[r] = NULL; 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(recognizers); 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers = NULL; 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::CharsetDetector(UErrorCode &status) 16185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho : textIn(new InputText(status)), resultArray(NULL), 16285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setRecognizers(status); 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (resultArray == NULL) { 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray[i] = new CharsetMatch(); 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (resultArray[i] == NULL) { 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::~CharsetDetector() 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete textIn; 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete resultArray[i]; 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(resultArray); 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setText(const char *in, int32_t len) 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->setText(in, len); 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = TRUE; 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::setStripTagsFlag(UBool flag) 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool temp = fStripTags; 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fStripTags = flag; 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = TRUE; 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return temp; 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::getStripTagsFlag() const 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fStripTags; 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->setDeclaredEncoding(encoding,len); 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t CharsetDetector::getDetectableCount() 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setRecognizers(status); 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers_size; 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch *CharsetDetector::detect(UErrorCode &status) 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxMatchesFound = 0; 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru detectAll(maxMatchesFound, status); 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxMatchesFound > 0) { 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return resultArray[0]; 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!textIn->isSet()) { 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 25454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } else if (fFreshTextSet) { 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer *csr; 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i; 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->MungeInput(fStripTags); 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Iterate over all possible charsets, remember all that 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // give a match quality > 0. 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultCount = 0; 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i = 0; i < fCSRecognizers_size; i += 1) { 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru csr = fCSRecognizers[i]; 26554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (csr->match(textIn, resultArray[resultCount])) { 26654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius resultCount++; 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 27054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (resultCount > 1) { 27154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 27285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = FALSE; 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxMatchesFound = resultCount; 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return resultArray; 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( index > fCSRecognizers_size-1 || index < 0) { 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_INDEX_OUTOFBOUNDS_ERROR; 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers[index]->getName(); 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/ 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querutypedef struct { 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t currIndex; 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} Context; 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumClose(UEnumeration *en) { 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(en->context != NULL) { 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(en->context); 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(en); 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumCount(UEnumeration *, UErrorCode *) { 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers_size; 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const char* U_CALLCONV 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(resultLength != NULL) { 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *resultLength = 0; 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(resultLength != NULL) { 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *resultLength = (int32_t)uprv_strlen(currName); 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((Context *)en->context)->currIndex++; 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return currName; 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumReset(UEnumeration *en, UErrorCode *) { 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((Context *)en->context)->currIndex = 0; 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const UEnumeration gCSDetEnumeration = { 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NULL, 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NULL, 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumClose, 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumCount, 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uenum_unextDefault, 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumNext, 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumReset 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UEnumeration * U_EXPORT2 34885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Houcsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_NAMESPACE_USE 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* Initialize recognized charsets. */ 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector::getDetectableCount(); 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru en->context = (void*)NEW_ARRAY(Context, 1); 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(en->context, 0, sizeof(Context)); 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return en; 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 369