1c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/* 2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ********************************************************************** 3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * Copyright (C) 2005-2009, International Business Machines 4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott * Corporation and others. All Rights Reserved. 5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ********************************************************************** 6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott */ 7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/utypes.h" 9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if !UCONFIG_NO_CONVERSION 11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/ucsdet.h" 13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csdetect.h" 15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csmatch.h" 16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "uenumimp.h" 17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cmemory.h" 19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "cstring.h" 20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "umutex.h" 21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "ucln_in.h" 22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "uarrsort.h" 23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "inputext.h" 24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrsbcs.h" 25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrmbcs.h" 26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrutf8.h" 27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csrucode.h" 28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "csr2022.h" 29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#define DELETE_ARRAY(array) uprv_free((void *) (array)) 34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_BEGIN 36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; 37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic int32_t fCSRecognizers_size = 0; 39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic UBool U_CALLCONV csdet_cleanup(void) 41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (fCSRecognizers != NULL) { 43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete fCSRecognizers[r]; 45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fCSRecognizers[r] = NULL; 46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(fCSRecognizers); 49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fCSRecognizers = NULL; 50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fCSRecognizers_size = 0; 51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return TRUE; 54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic int32_t U_CALLCONV 57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottcharsetMatchComparator(const void * /*context*/, const void *left, const void *right) 58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott U_NAMESPACE_USE 60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CharsetMatch **csm_l = (const CharsetMatch **) left; 62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const CharsetMatch **csm_r = (const CharsetMatch **) right; 63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // NOTE: compare is backwards to sort from highest to lowest. 65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_END 69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_BEGIN 71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CharsetDetector::setRecognizers(UErrorCode &status) 73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool needsInit; 75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott CharsetRecognizer **recognizers; 76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_FAILURE(status)) { 78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); 82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 83c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (needsInit) { 84c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott CharsetRecognizer *tempArray[] = { 85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_UTF8(), 86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_UTF_16_BE(), 88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_UTF_16_LE(), 89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_UTF_32_BE(), 90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_UTF_32_LE(), 91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_en(), 93c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_da(), 94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_de(), 95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_es(), 96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_fr(), 97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_it(), 98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_nl(), 99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_no(), 100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_pt(), 101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_1_sv(), 102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_2_cs(), 103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_2_hu(), 104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_2_pl(), 105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_2_ro(), 106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_5_ru(), 107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_6_ar(), 108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_7_el(), 109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_8_I_he(), 110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_8_he(), 111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_windows_1251(), 112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_windows_1256(), 113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_KOI8_R(), 114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_8859_9_tr(), 115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_sjis(), 116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_gb_18030(), 117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_euc_jp(), 118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_euc_kr(), 119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_big5(), 120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_2022JP(), 122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_2022KR(), 123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_2022CN(), 124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_IBM424_he_rtl(), 126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_IBM424_he_ltr(), 127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_IBM420_ar_rtl(), 128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott new CharsetRecog_IBM420_ar_ltr() 129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott }; 130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t rCount = ARRAY_SIZE(tempArray); 131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t r; 132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); 134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (recognizers == NULL) { 136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (r = 0; r < rCount; r += 1) { 140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott recognizers[r] = tempArray[r]; 141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (recognizers[r] == NULL) { 143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_SUCCESS(status)) { 150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott umtx_lock(NULL); 151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (fCSRecognizers == NULL) { 152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fCSRecognizers_size = rCount; 153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fCSRecognizers = recognizers; 154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott umtx_unlock(NULL); 156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (fCSRecognizers != recognizers) { 159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (r = 0; r < rCount; r += 1) { 160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete recognizers[r]; 161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott recognizers[r] = NULL; 162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(recognizers); 165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott recognizers = NULL; 168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottCharsetDetector::CharsetDetector(UErrorCode &status) 173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott : textIn(new InputText(status)), resultArray(NULL), 174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) 175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_FAILURE(status)) { 177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott setRecognizers(status); 181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 182c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (U_FAILURE(status)) { 183c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (resultArray == NULL) { 189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return; 191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 192c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultArray[i] = new CharsetMatch(); 195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (resultArray[i] == NULL) { 197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MEMORY_ALLOCATION_ERROR; 198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott break; 199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 201c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 202c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 203c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottCharsetDetector::~CharsetDetector() 204c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 205c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete textIn; 206c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 207c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 208c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott delete resultArray[i]; 209c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 210c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 211c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uprv_free(resultArray); 212c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 213c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 214c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CharsetDetector::setText(const char *in, int32_t len) 215c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 216c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott textIn->setText(in, len); 217c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fFreshTextSet = TRUE; 218c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 219c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 220c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool CharsetDetector::setStripTagsFlag(UBool flag) 221c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 222c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UBool temp = fStripTags; 223c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fStripTags = flag; 224c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fFreshTextSet = TRUE; 225c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return temp; 226c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 227c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 228c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottUBool CharsetDetector::getStripTagsFlag() const 229c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 230c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return fStripTags; 231c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 232c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 233c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 234c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 235c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott textIn->setDeclaredEncoding(encoding,len); 236c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 237c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 238c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottint32_t CharsetDetector::getDetectableCount() 239c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 240c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 241c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 242c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott setRecognizers(status); 243c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 244c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return fCSRecognizers_size; 245c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 246c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 247c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst CharsetMatch *CharsetDetector::detect(UErrorCode &status) 248c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 249c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t maxMatchesFound = 0; 250c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 251c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott detectAll(maxMatchesFound, status); 252c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 253c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(maxMatchesFound > 0) { 254c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return resultArray[0]; 255c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 256c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return NULL; 257c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 258c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 259c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 260c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 261c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 262c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(!textIn->isSet()) { 263c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 264c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 265c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return NULL; 266c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else if(fFreshTextSet) { 267c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott CharsetRecognizer *csr; 268c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t detectResults; 269c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t confidence; 270c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t i; 271c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 272c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott textIn->MungeInput(fStripTags); 273c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 274c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Iterate over all possible charsets, remember all that 275c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // give a match quality > 0. 276c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultCount = 0; 277c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (i = 0; i < fCSRecognizers_size; i += 1) { 278c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott csr = fCSRecognizers[i]; 279c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott detectResults = csr->match(textIn); 280c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott confidence = detectResults; 281c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 282c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (confidence > 0) { 283c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultArray[resultCount++]->set(textIn, csr, confidence); 284c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 285c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 286c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 287c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for(i = resultCount; i < fCSRecognizers_size; i += 1) { 288c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultArray[i]->set(textIn, 0, 0); 289c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 290c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 291c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 292c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 293c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Remove duplicate charsets from the results. 294c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Simple minded, brute force approach - check each entry against all that follow. 295c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // The first entry of any duplicated set is the one that should be kept because it will 296c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // be the one with the highest confidence rating. 297c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // (Duplicate matches have different languages, only the charset is the same) 298c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually 299c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // deleted, just reordered, with the unwanted duplicates placed after the good results. 300c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t j, k; 301c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (i=0; i<resultCount; i++) { 302c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const char *charSetName = resultArray[i]->getName(); 303c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (j=i+1; j<resultCount; ) { 304c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) { 305c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Not a duplicate. 306c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott j++; 307c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 308c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Duplicate entry at index j. 309c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott CharsetMatch *duplicate = resultArray[j]; 310c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (k=j; k<resultCount-1; k++) { 311c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultArray[k] = resultArray[k+1]; 312c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 313c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultCount--; 314c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott resultArray[resultCount] = duplicate; 315c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 316c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 317c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 318c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 319c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott fFreshTextSet = FALSE; 320c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 321c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 322c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott maxMatchesFound = resultCount; 323c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 324c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return resultArray; 325c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 326c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 327c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 328c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 329c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if( index > fCSRecognizers_size-1 || index < 0) { 330c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status = U_INDEX_OUTOFBOUNDS_ERROR; 331c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 332c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return 0; 333c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } else { 334c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return fCSRecognizers[index]->getName(); 335c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 336c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}*/ 337c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 338c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_NAMESPACE_END 339c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 340c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_BEGIN 341c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttypedef struct { 342c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int32_t currIndex; 343c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} Context; 344c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 345c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 346c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 347c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic void U_CALLCONV 348c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumClose(UEnumeration *en) { 349c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(en->context != NULL) { 350c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(en->context); 351c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 352c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 353c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DELETE_ARRAY(en); 354c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 355c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 356c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic int32_t U_CALLCONV 357c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumCount(UEnumeration *, UErrorCode *) { 358c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return fCSRecognizers_size; 359c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 360c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 361c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic const char* U_CALLCONV 362c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 363c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { 364c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(resultLength != NULL) { 365c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott *resultLength = 0; 366c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 367c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return NULL; 368c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 369c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); 370c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(resultLength != NULL) { 371c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott *resultLength = (int32_t)uprv_strlen(currName); 372c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 373c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ((Context *)en->context)->currIndex++; 374c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 375c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return currName; 376c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 377c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 378c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic void U_CALLCONV 379c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottenumReset(UEnumeration *en, UErrorCode *) { 380c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ((Context *)en->context)->currIndex = 0; 381c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 382c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 383c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottstatic const UEnumeration gCSDetEnumeration = { 384c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NULL, 385c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NULL, 386c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott enumClose, 387c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott enumCount, 388c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uenum_unextDefault, 389c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott enumNext, 390c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott enumReset 391c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 392c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 393c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CAPI UEnumeration * U_EXPORT2 394c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) 395c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott{ 396c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott U_NAMESPACE_USE 397c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 398c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if(U_FAILURE(*status)) { 399c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return 0; 400c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 401c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 402c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott /* Initialize recognized charsets. */ 403c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott CharsetDetector::getDetectableCount(); 404c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 405c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 406c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 407c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott en->context = (void*)NEW_ARRAY(Context, 1); 408c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uprv_memset(en->context, 0, sizeof(Context)); 409c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return en; 410c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 411c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottU_CDECL_END 412c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 413c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 414c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 415