csdetect.cpp revision ac04d0bbe12b3ef54518635711412f178cb4d16
1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Copyright (C) 2005-2007, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ucsdet.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csdetect.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csmatch.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uenumimp.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "umutex.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "ucln_in.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uarrsort.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "inputext.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrsbcs.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrmbcs.h" 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrutf8.h" 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csrucode.h" 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "csr2022.h" 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DELETE_ARRAY(array) uprv_free((void *) (array)) 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t fCSRecognizers_size = 0; 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic UBool U_CALLCONV csdet_cleanup(void) 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers != NULL) { 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fCSRecognizers[r]; 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers[r] = NULL; 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(fCSRecognizers); 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers = NULL; 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers_size = 0; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerucharsetMatchComparator(const void * /*context*/, const void *left, const void *right) 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_NAMESPACE_USE 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const CharsetMatch **csm_l = (const CharsetMatch **) left; 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const CharsetMatch **csm_r = (const CharsetMatch **) right; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // NOTE: compare is backwards to sort from highest to lowest. 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setRecognizers(UErrorCode &status) 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool needsInit; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer **recognizers; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (needsInit) { 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer *tempArray[] = { 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF8(), 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_16_BE(), 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_16_LE(), 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_32_BE(), 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_UTF_32_LE(), 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_en(), 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_da(), 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_de(), 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_es(), 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_fr(), 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_it(), 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_nl(), 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_no(), 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_pt(), 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_1_sv(), 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_cs(), 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_hu(), 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_pl(), 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_2_ro(), 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_5_ru(), 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_6_ar(), 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_7_el(), 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_8_I_he(), 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_8_he(), 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_windows_1251(), 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_windows_1256(), 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_KOI8_R(), 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_8859_9_tr(), 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_sjis(), 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_gb_18030(), 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_euc_jp(), 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_euc_kr(), 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_big5(), 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_2022JP(), 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_2022KR(), 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru new CharsetRecog_2022CN() 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rCount = ARRAY_SIZE(tempArray); 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t r; 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (recognizers == NULL) { 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (r = 0; r < rCount; r += 1) { 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers[r] = tempArray[r]; 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (recognizers[r] == NULL) { 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_SUCCESS(status)) { 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru umtx_lock(NULL); 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers == NULL) { 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers = recognizers; 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fCSRecognizers_size = rCount; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru umtx_unlock(NULL); 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fCSRecognizers != recognizers) { 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (r = 0; r < rCount; r += 1) { 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete recognizers[r]; 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers[r] = NULL; 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(recognizers); 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru recognizers = NULL; 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::CharsetDetector(UErrorCode &status) 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : textIn(new InputText()), resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setRecognizers(status); 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (resultArray == NULL) { 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray[i] = new CharsetMatch(); 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (resultArray[i] == NULL) { 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruCharsetDetector::~CharsetDetector() 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete textIn; 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete resultArray[i]; 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(resultArray); 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setText(const char *in, int32_t len) 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->setText(in, len); 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = TRUE; 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::setStripTagsFlag(UBool flag) 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool temp = fStripTags; 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fStripTags = flag; 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = TRUE; 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return temp; 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool CharsetDetector::getStripTagsFlag() const 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fStripTags; 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->setDeclaredEncoding(encoding,len); 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t CharsetDetector::getDetectableCount() 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setRecognizers(status); 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers_size; 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch *CharsetDetector::detect(UErrorCode &status) 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t maxMatchesFound = 0; 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru detectAll(maxMatchesFound, status); 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(maxMatchesFound > 0) { 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return resultArray[0]; 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!textIn->isSet()) { 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(fFreshTextSet) { 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetRecognizer *csr; 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t detectResults; 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t confidence; 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i; 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textIn->MungeInput(fStripTags); 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Iterate over all possible charsets, remember all that 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // give a match quality > 0. 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultCount = 0; 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i = 0; i < fCSRecognizers_size; i += 1) { 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru csr = fCSRecognizers[i]; 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru detectResults = csr->match(textIn); 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru confidence = detectResults; 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (confidence > 0) { 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray[resultCount++]->set(textIn, csr, confidence); 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i = resultCount; i < fCSRecognizers_size; i += 1) { 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru resultArray[i]->set(textIn, 0, 0); 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ////Bubble sort 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //for(int32_t i = resultCount; i > 1; i -= 1) { 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // for(int32_t j = 0; j < i-1; j += 1) { 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) { 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // CharsetMatch *temp = resultArray[j]; 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // resultArray[j] = resultArray[j+1]; 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // resultArray[j+1] = temp; 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // } 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // } 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //} 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFreshTextSet = FALSE; 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru maxMatchesFound = resultCount; 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return resultArray; 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( index > fCSRecognizers_size-1 || index < 0) { 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_INDEX_OUTOFBOUNDS_ERROR; 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers[index]->getName(); 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/ 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_BEGIN 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querutypedef struct { 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t currIndex; 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} Context; 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumClose(UEnumeration *en) { 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(en->context != NULL) { 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(en->context); 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DELETE_ARRAY(en); 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t U_CALLCONV 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumCount(UEnumeration *, UErrorCode *) { 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fCSRecognizers_size; 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const char* U_CALLCONV 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(resultLength != NULL) { 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *resultLength = 0; 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(resultLength != NULL) { 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *resultLength = (int32_t)uprv_strlen(currName); 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((Context *)en->context)->currIndex++; 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return currName; 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void U_CALLCONV 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruenumReset(UEnumeration *en, UErrorCode *) { 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((Context *)en->context)->currIndex = 0; 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const UEnumeration gCSDetEnumeration = { 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NULL, 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NULL, 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumClose, 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumCount, 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uenum_unextDefault, 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumNext, 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enumReset 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UEnumeration * U_EXPORT2 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status) 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru U_NAMESPACE_USE 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(*status)) { 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* Initialize recognized charsets. */ 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru CharsetDetector::getDetectableCount(); 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru en->context = (void*)NEW_ARRAY(Context, 1); 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(en->context, 0, sizeof(Context)); 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return en; 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CDECL_END 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 392