1/* 2 ********************************************************************** 3 * Copyright (C) 2005-2013, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8#include "unicode/utypes.h" 9 10#if !UCONFIG_NO_CONVERSION 11 12#include "unicode/ucsdet.h" 13 14#include "csdetect.h" 15#include "csmatch.h" 16#include "uenumimp.h" 17 18#include "cmemory.h" 19#include "cstring.h" 20#include "umutex.h" 21#include "ucln_in.h" 22#include "uarrsort.h" 23#include "inputext.h" 24#include "csrsbcs.h" 25#include "csrmbcs.h" 26#include "csrutf8.h" 27#include "csrucode.h" 28#include "csr2022.h" 29 30#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 31 32#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33#define DELETE_ARRAY(array) uprv_free((void *) (array)) 34 35U_NAMESPACE_BEGIN 36 37struct CSRecognizerInfo : public UMemory { 38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) 39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; 40 41 ~CSRecognizerInfo() {delete recognizer;}; 42 43 CharsetRecognizer *recognizer; 44 UBool isDefaultEnabled; 45}; 46 47U_NAMESPACE_END 48 49static icu::CSRecognizerInfo **fCSRecognizers = NULL; 50static icu::UInitOnce gCSRecognizersInitOnce; 51static int32_t fCSRecognizers_size = 0; 52 53U_CDECL_BEGIN 54static UBool U_CALLCONV csdet_cleanup(void) 55{ 56 U_NAMESPACE_USE 57 if (fCSRecognizers != NULL) { 58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 59 delete fCSRecognizers[r]; 60 fCSRecognizers[r] = NULL; 61 } 62 63 DELETE_ARRAY(fCSRecognizers); 64 fCSRecognizers = NULL; 65 fCSRecognizers_size = 0; 66 } 67 gCSRecognizersInitOnce.reset(); 68 69 return TRUE; 70} 71 72static int32_t U_CALLCONV 73charsetMatchComparator(const void * /*context*/, const void *left, const void *right) 74{ 75 U_NAMESPACE_USE 76 77 const CharsetMatch **csm_l = (const CharsetMatch **) left; 78 const CharsetMatch **csm_r = (const CharsetMatch **) right; 79 80 // NOTE: compare is backwards to sort from highest to lowest. 81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 82} 83 84static void U_CALLCONV initRecognizers(UErrorCode &status) { 85 U_NAMESPACE_USE 86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 87 CSRecognizerInfo *tempArray[] = { 88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), 89 90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), 91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), 92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), 93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), 94 95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), 96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), 97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), 98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), 99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), 100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), 101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), 102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), 103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), 104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), 105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), 106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), 107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), 108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), 109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), 110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), 111 112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), 113 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), 114 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), 115 116 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), 117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), 118 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), 119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) 120 }; 121 int32_t rCount = ARRAY_SIZE(tempArray); 122 123 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); 124 125 if (fCSRecognizers == NULL) { 126 status = U_MEMORY_ALLOCATION_ERROR; 127 } 128 else { 129 fCSRecognizers_size = rCount; 130 for (int32_t r = 0; r < rCount; r += 1) { 131 fCSRecognizers[r] = tempArray[r]; 132 if (fCSRecognizers[r] == NULL) { 133 status = U_MEMORY_ALLOCATION_ERROR; 134 } 135 } 136 } 137} 138 139U_CDECL_END 140 141U_NAMESPACE_BEGIN 142 143void CharsetDetector::setRecognizers(UErrorCode &status) 144{ 145 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); 146} 147 148CharsetDetector::CharsetDetector(UErrorCode &status) 149 : textIn(new InputText(status)), resultArray(NULL), 150 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), 151 fEnabledRecognizers(NULL) 152{ 153 if (U_FAILURE(status)) { 154 return; 155 } 156 157 setRecognizers(status); 158 159 if (U_FAILURE(status)) { 160 return; 161 } 162 163 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 164 165 if (resultArray == NULL) { 166 status = U_MEMORY_ALLOCATION_ERROR; 167 return; 168 } 169 170 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 171 resultArray[i] = new CharsetMatch(); 172 173 if (resultArray[i] == NULL) { 174 status = U_MEMORY_ALLOCATION_ERROR; 175 break; 176 } 177 } 178} 179 180CharsetDetector::~CharsetDetector() 181{ 182 delete textIn; 183 184 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 185 delete resultArray[i]; 186 } 187 188 uprv_free(resultArray); 189 190 if (fEnabledRecognizers) { 191 uprv_free(fEnabledRecognizers); 192 } 193} 194 195void CharsetDetector::setText(const char *in, int32_t len) 196{ 197 textIn->setText(in, len); 198 fFreshTextSet = TRUE; 199} 200 201UBool CharsetDetector::setStripTagsFlag(UBool flag) 202{ 203 UBool temp = fStripTags; 204 fStripTags = flag; 205 fFreshTextSet = TRUE; 206 return temp; 207} 208 209UBool CharsetDetector::getStripTagsFlag() const 210{ 211 return fStripTags; 212} 213 214void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 215{ 216 textIn->setDeclaredEncoding(encoding,len); 217} 218 219int32_t CharsetDetector::getDetectableCount() 220{ 221 UErrorCode status = U_ZERO_ERROR; 222 223 setRecognizers(status); 224 225 return fCSRecognizers_size; 226} 227 228const CharsetMatch *CharsetDetector::detect(UErrorCode &status) 229{ 230 int32_t maxMatchesFound = 0; 231 232 detectAll(maxMatchesFound, status); 233 234 if(maxMatchesFound > 0) { 235 return resultArray[0]; 236 } else { 237 return NULL; 238 } 239} 240 241const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 242{ 243 if(!textIn->isSet()) { 244 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 245 246 return NULL; 247 } else if (fFreshTextSet) { 248 CharsetRecognizer *csr; 249 int32_t i; 250 251 textIn->MungeInput(fStripTags); 252 253 // Iterate over all possible charsets, remember all that 254 // give a match quality > 0. 255 resultCount = 0; 256 for (i = 0; i < fCSRecognizers_size; i += 1) { 257 csr = fCSRecognizers[i]->recognizer; 258 if (csr->match(textIn, resultArray[resultCount])) { 259 resultCount++; 260 } 261 } 262 263 if (resultCount > 1) { 264 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 265 } 266 fFreshTextSet = FALSE; 267 } 268 269 maxMatchesFound = resultCount; 270 271 return resultArray; 272} 273 274void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) 275{ 276 if (U_FAILURE(status)) { 277 return; 278 } 279 280 int32_t modIdx = -1; 281 UBool isDefaultVal = FALSE; 282 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 283 CSRecognizerInfo *csrinfo = fCSRecognizers[i]; 284 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { 285 modIdx = i; 286 isDefaultVal = (csrinfo->isDefaultEnabled == enabled); 287 break; 288 } 289 } 290 if (modIdx < 0) { 291 // No matching encoding found 292 status = U_ILLEGAL_ARGUMENT_ERROR; 293 return; 294 } 295 296 if (fEnabledRecognizers == NULL && !isDefaultVal) { 297 // Create an array storing the non default setting 298 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); 299 if (fEnabledRecognizers == NULL) { 300 status = U_MEMORY_ALLOCATION_ERROR; 301 return; 302 } 303 // Initialize the array with default info 304 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 305 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; 306 } 307 } 308 309 if (fEnabledRecognizers != NULL) { 310 fEnabledRecognizers[modIdx] = enabled; 311 } 312} 313 314/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 315{ 316 if( index > fCSRecognizers_size-1 || index < 0) { 317 status = U_INDEX_OUTOFBOUNDS_ERROR; 318 319 return 0; 320 } else { 321 return fCSRecognizers[index]->getName(); 322 } 323}*/ 324 325U_NAMESPACE_END 326 327U_CDECL_BEGIN 328typedef struct { 329 int32_t currIndex; 330 UBool all; 331 UBool *enabledRecognizers; 332} Context; 333 334 335 336static void U_CALLCONV 337enumClose(UEnumeration *en) { 338 if(en->context != NULL) { 339 DELETE_ARRAY(en->context); 340 } 341 342 DELETE_ARRAY(en); 343} 344 345static int32_t U_CALLCONV 346enumCount(UEnumeration *en, UErrorCode *) { 347 if (((Context *)en->context)->all) { 348 // ucsdet_getAllDetectableCharsets, all charset detector names 349 return fCSRecognizers_size; 350 } 351 352 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones 353 int32_t count = 0; 354 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 355 if (enabledArray != NULL) { 356 // custom set 357 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 358 if (enabledArray[i]) { 359 count++; 360 } 361 } 362 } else { 363 // default set 364 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 365 if (fCSRecognizers[i]->isDefaultEnabled) { 366 count++; 367 } 368 } 369 } 370 return count; 371} 372 373static const char* U_CALLCONV 374enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 375 const char *currName = NULL; 376 377 if (((Context *)en->context)->currIndex < fCSRecognizers_size) { 378 if (((Context *)en->context)->all) { 379 // ucsdet_getAllDetectableCharsets, all charset detector names 380 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 381 ((Context *)en->context)->currIndex++; 382 } else { 383 // ucsdet_getDetectableCharsets 384 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 385 if (enabledArray != NULL) { 386 // custome set 387 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 388 if (enabledArray[((Context *)en->context)->currIndex]) { 389 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 390 } 391 ((Context *)en->context)->currIndex++; 392 } 393 } else { 394 // default set 395 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 396 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { 397 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 398 } 399 ((Context *)en->context)->currIndex++; 400 } 401 } 402 } 403 } 404 405 if(resultLength != NULL) { 406 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); 407 } 408 409 return currName; 410} 411 412 413static void U_CALLCONV 414enumReset(UEnumeration *en, UErrorCode *) { 415 ((Context *)en->context)->currIndex = 0; 416} 417 418static const UEnumeration gCSDetEnumeration = { 419 NULL, 420 NULL, 421 enumClose, 422 enumCount, 423 uenum_unextDefault, 424 enumNext, 425 enumReset 426}; 427 428U_CDECL_END 429 430U_NAMESPACE_BEGIN 431 432UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) 433{ 434 435 /* Initialize recognized charsets. */ 436 setRecognizers(status); 437 438 if(U_FAILURE(status)) { 439 return 0; 440 } 441 442 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 443 if (en == NULL) { 444 status = U_MEMORY_ALLOCATION_ERROR; 445 return 0; 446 } 447 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 448 en->context = (void*)NEW_ARRAY(Context, 1); 449 if (en->context == NULL) { 450 status = U_MEMORY_ALLOCATION_ERROR; 451 DELETE_ARRAY(en); 452 return 0; 453 } 454 uprv_memset(en->context, 0, sizeof(Context)); 455 ((Context*)en->context)->all = TRUE; 456 return en; 457} 458 459UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const 460{ 461 if(U_FAILURE(status)) { 462 return 0; 463 } 464 465 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 466 if (en == NULL) { 467 status = U_MEMORY_ALLOCATION_ERROR; 468 return 0; 469 } 470 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 471 en->context = (void*)NEW_ARRAY(Context, 1); 472 if (en->context == NULL) { 473 status = U_MEMORY_ALLOCATION_ERROR; 474 DELETE_ARRAY(en); 475 return 0; 476 } 477 uprv_memset(en->context, 0, sizeof(Context)); 478 ((Context*)en->context)->all = FALSE; 479 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; 480 return en; 481} 482 483U_NAMESPACE_END 484 485#endif 486