1/* 2 **************************************************************************** 3 * Copyright (c) 2005-2009, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 **************************************************************************** 6 */ 7 8#include "unicode/utypes.h" 9 10#include "unicode/ucsdet.h" 11#include "unicode/ucnv.h" 12#include "unicode/ustring.h" 13 14#include "cintltst.h" 15 16#include <stdlib.h> 17#include <string.h> 18 19#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 20 21#define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type)) 22#define DELETE_ARRAY(array) free(array) 23 24static void TestConstruction(void); 25static void TestUTF8(void); 26static void TestUTF16(void); 27static void TestC1Bytes(void); 28static void TestInputFilter(void); 29static void TestChaining(void); 30static void TestBufferOverflow(void); 31static void TestIBM424(void); 32static void TestIBM420(void); 33 34void addUCsdetTest(TestNode** root); 35 36void addUCsdetTest(TestNode** root) 37{ 38 addTest(root, &TestConstruction, "ucsdetst/TestConstruction"); 39 addTest(root, &TestUTF8, "ucsdetst/TestUTF8"); 40 addTest(root, &TestUTF16, "ucsdetst/TestUTF16"); 41 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes"); 42 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter"); 43 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining"); 44 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow"); 45 addTest(root, &TestIBM424, "ucsdetst/TestIBM424"); 46 addTest(root, &TestIBM420, "ucsdetst/TestIBM420"); 47} 48 49static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) 50{ 51 UErrorCode status; 52 char buffer[1024]; 53 char *dest, *destLimit = buffer + sizeof(buffer); 54 const UChar *srcLimit = src + length; 55 int32_t result = 0; 56 57 do { 58 dest = buffer; 59 status = U_ZERO_ERROR; 60 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); 61 result += (int32_t) (dest - buffer); 62 } while (status == U_BUFFER_OVERFLOW_ERROR); 63 64 return result; 65} 66 67static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength) 68{ 69 UErrorCode status = U_ZERO_ERROR; 70 UConverter *cnv = ucnv_open(codepage, &status); 71 int32_t byteCount = preflight(src, length, cnv); 72 const UChar *srcLimit = src + length; 73 char *bytes = NEW_ARRAY(char, byteCount + 1); 74 char *dest = bytes, *destLimit = bytes + byteCount + 1; 75 76 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); 77 ucnv_close(cnv); 78 79 *byteLength = byteCount; 80 return bytes; 81} 82 83static void freeBytes(char *bytes) 84{ 85 DELETE_ARRAY(bytes); 86} 87 88static void TestConstruction(void) 89{ 90 UErrorCode status = U_ZERO_ERROR; 91 UCharsetDetector *csd = ucsdet_open(&status); 92 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); 93 const char *name; 94 int32_t count = uenum_count(e, &status); 95 int32_t i, length; 96 97 for(i = 0; i < count; i += 1) { 98 name = uenum_next(e, &length, &status); 99 100 if(name == NULL || length <= 0) { 101 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n"); 102 } 103 } 104 /* one past the list of all names must return NULL */ 105 name = uenum_next(e, &length, &status); 106 if(name != NULL || length != 0 || U_FAILURE(status)) { 107 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n"); 108 } 109 110 uenum_close(e); 111 ucsdet_close(csd); 112} 113 114static void TestUTF8(void) 115{ 116 UErrorCode status = U_ZERO_ERROR; 117 static const char ss[] = "This is a string with some non-ascii characters that will " 118 "be converted to UTF-8, then shoved through the detection process. " 119 "\\u0391\\u0392\\u0393\\u0394\\u0395" 120 "Sure would be nice if our source could contain Unicode directly!"; 121 int32_t byteLength = 0, sLength = 0, dLength = 0; 122 UChar s[sizeof(ss)]; 123 char *bytes; 124 UCharsetDetector *csd = ucsdet_open(&status); 125 const UCharsetMatch *match; 126 UChar detected[sizeof(ss)]; 127 128 sLength = u_unescape(ss, s, sizeof(ss)); 129 bytes = extractBytes(s, sLength, "UTF-8", &byteLength); 130 131 ucsdet_setText(csd, bytes, byteLength, &status); 132 if (U_FAILURE(status)) { 133 log_err("status is %s\n", u_errorName(status)); 134 goto bail; 135 } 136 137 match = ucsdet_detect(csd, &status); 138 139 if (match == NULL) { 140 log_err("Detection failure for UTF-8: got no matches.\n"); 141 goto bail; 142 } 143 144 dLength = ucsdet_getUChars(match, detected, sLength, &status); 145 146 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) { 147 log_err("Round-trip test failed!\n"); 148 } 149 150 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ 151 152bail: 153 freeBytes(bytes); 154 ucsdet_close(csd); 155} 156 157static void TestUTF16(void) 158{ 159 UErrorCode status = U_ZERO_ERROR; 160 /* Notice the BOM on the start of this string */ 161 static const UChar chars[] = { 162 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, 163 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, 164 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, 165 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, 166 0x064a, 0x062a, 0x0000}; 167 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars); 168 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength); 169 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength); 170 UCharsetDetector *csd = ucsdet_open(&status); 171 const UCharsetMatch *match; 172 const char *name; 173 int32_t conf; 174 175 ucsdet_setText(csd, beBytes, beLength, &status); 176 match = ucsdet_detect(csd, &status); 177 178 if (match == NULL) { 179 log_err("Encoding detection failure for UTF-16BE: got no matches.\n"); 180 goto try_le; 181 } 182 183 name = ucsdet_getName(match, &status); 184 conf = ucsdet_getConfidence(match, &status); 185 186 if (strcmp(name, "UTF-16BE") != 0) { 187 log_err("Encoding detection failure for UTF-16BE: got %s\n", name); 188 } 189 190 if (conf != 100) { 191 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf); 192 } 193 194try_le: 195 ucsdet_setText(csd, leBytes, leLength, &status); 196 match = ucsdet_detect(csd, &status); 197 198 if (match == NULL) { 199 log_err("Encoding detection failure for UTF-16LE: got no matches.\n"); 200 goto bail; 201 } 202 203 name = ucsdet_getName(match, &status); 204 conf = ucsdet_getConfidence(match, &status); 205 206 207 if (strcmp(name, "UTF-16LE") != 0) { 208 log_err("Enconding detection failure for UTF-16LE: got %s\n", name); 209 } 210 211 if (conf != 100) { 212 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf); 213 } 214 215bail: 216 freeBytes(leBytes); 217 freeBytes(beBytes); 218 ucsdet_close(csd); 219} 220 221static void TestC1Bytes(void) 222{ 223#if !UCONFIG_NO_LEGACY_CONVERSION 224 UErrorCode status = U_ZERO_ERROR; 225 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 226 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes."; 227 int32_t sISOLength = 0, sWindowsLength = 0; 228 UChar sISO[sizeof(ssISO)]; 229 UChar sWindows[sizeof(ssWindows)]; 230 int32_t lISO = 0, lWindows = 0; 231 char *bISO; 232 char *bWindows; 233 UCharsetDetector *csd = ucsdet_open(&status); 234 const UCharsetMatch *match; 235 const char *name; 236 237 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO)); 238 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows)); 239 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO); 240 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows); 241 242 ucsdet_setText(csd, bWindows, lWindows, &status); 243 match = ucsdet_detect(csd, &status); 244 245 if (match == NULL) { 246 log_err("English test with C1 bytes got no matches.\n"); 247 goto bail; 248 } 249 250 name = ucsdet_getName(match, &status); 251 252 if (strcmp(name, "windows-1252") != 0) { 253 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name); 254 } 255 256 ucsdet_setText(csd, bISO, lISO, &status); 257 match = ucsdet_detect(csd, &status); 258 259 if (match == NULL) { 260 log_err("English text without C1 bytes got no matches.\n"); 261 goto bail; 262 } 263 264 name = ucsdet_getName(match, &status); 265 266 if (strcmp(name, "ISO-8859-1") != 0) { 267 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name); 268 } 269 270bail: 271 freeBytes(bWindows); 272 freeBytes(bISO); 273 274 ucsdet_close(csd); 275#endif 276} 277 278static void TestInputFilter(void) 279{ 280 UErrorCode status = U_ZERO_ERROR; 281 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; 282 int32_t sLength = 0; 283 UChar s[sizeof(ss)]; 284 int32_t byteLength = 0; 285 char *bytes; 286 UCharsetDetector *csd = ucsdet_open(&status); 287 const UCharsetMatch *match; 288 const char *lang, *name; 289 290 sLength = u_unescape(ss, s, sizeof(ss)); 291 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength); 292 293 ucsdet_enableInputFilter(csd, TRUE); 294 295 if (!ucsdet_isInputFilterEnabled(csd)) { 296 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n"); 297 } 298 299 300 ucsdet_setText(csd, bytes, byteLength, &status); 301 match = ucsdet_detect(csd, &status); 302 303 if (match == NULL) { 304 log_err("Turning on the input filter resulted in no matches.\n"); 305 goto turn_off; 306 } 307 308 name = ucsdet_getName(match, &status); 309 310 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 311 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name); 312 } else { 313 lang = ucsdet_getLanguage(match, &status); 314 315 if (lang == NULL || strcmp(lang, "fr") != 0) { 316 log_err("Input filter did not strip markup!\n"); 317 } 318 } 319 320turn_off: 321 ucsdet_enableInputFilter(csd, FALSE); 322 ucsdet_setText(csd, bytes, byteLength, &status); 323 match = ucsdet_detect(csd, &status); 324 325 if (match == NULL) { 326 log_err("Turning off the input filter resulted in no matches.\n"); 327 goto bail; 328 } 329 330 name = ucsdet_getName(match, &status); 331 332 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 333 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name); 334 } else { 335 lang = ucsdet_getLanguage(match, &status); 336 337 if (lang == NULL || strcmp(lang, "en") != 0) { 338 log_err("Unfiltered input did not detect as English!\n"); 339 } 340 } 341 342bail: 343 freeBytes(bytes); 344 ucsdet_close(csd); 345} 346 347static void TestChaining(void) { 348 UErrorCode status = U_USELESS_COLLATOR_ERROR; 349 350 ucsdet_open(&status); 351 ucsdet_setText(NULL, NULL, 0, &status); 352 ucsdet_getName(NULL, &status); 353 ucsdet_getConfidence(NULL, &status); 354 ucsdet_getLanguage(NULL, &status); 355 ucsdet_detect(NULL, &status); 356 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status); 357 ucsdet_detectAll(NULL, NULL, &status); 358 ucsdet_getUChars(NULL, NULL, 0, &status); 359 ucsdet_getUChars(NULL, NULL, 0, &status); 360 ucsdet_close(NULL); 361 362 /* All of this code should have done nothing. */ 363 if (status != U_USELESS_COLLATOR_ERROR) { 364 log_err("Status got changed to %s\n", u_errorName(status)); 365 } 366} 367 368static void TestBufferOverflow(void) { 369 UErrorCode status = U_ZERO_ERROR; 370 static const char *testStrings[] = { 371 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */ 372 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */ 373 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */ 374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */ 375 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */ 376 "\xa1", /* Could be a single byte shift-jis at the end */ 377 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */ 378 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */ 379 }; 380 static const char *testResults[] = { 381 "windows-1252", 382 "windows-1252", 383 "windows-1252", 384 "windows-1252", 385 "ISO-2022-JP", 386 NULL, 387 NULL, 388 "ISO-8859-1" 389 }; 390 int32_t idx = 0; 391 UCharsetDetector *csd = ucsdet_open(&status); 392 const UCharsetMatch *match; 393 394 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status); 395 396 if (U_FAILURE(status)) { 397 log_err("Couldn't open detector. %s\n", u_errorName(status)); 398 goto bail; 399 } 400 401 for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) { 402 ucsdet_setText(csd, testStrings[idx], -1, &status); 403 match = ucsdet_detect(csd, &status); 404 405 if (match == NULL) { 406 if (testResults[idx] != NULL) { 407 log_err("Unexpectedly got no results at index %d.\n", idx); 408 } 409 else { 410 log_verbose("Got no result as expected at index %d.\n", idx); 411 } 412 continue; 413 } 414 415 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) { 416 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n", 417 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status)); 418 goto bail; 419 } 420 } 421 422bail: 423 ucsdet_close(csd); 424} 425 426static void TestIBM424(void) 427{ 428 UErrorCode status = U_ZERO_ERROR; 429 430 static const UChar chars[] = { 431 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, 432 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, 433 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, 434 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, 435 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, 436 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, 437 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 438 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, 439 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, 440 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, 441 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, 442 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 443 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 444 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 445 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, 446 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 447 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 448 }; 449 450 static const UChar chars_reverse[] = { 451 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, 452 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, 453 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 454 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 455 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, 456 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, 457 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, 458 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, 459 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, 460 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, 461 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 462 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, 463 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, 464 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, 465 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, 466 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, 467 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, 468 0x0000 469 }; 470 471 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse); 472 473 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength); 474 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength); 475 476 UCharsetDetector *csd = ucsdet_open(&status); 477 const UCharsetMatch *match; 478 const char *name; 479 480 ucsdet_setText(csd, bytes, bLength, &status); 481 match = ucsdet_detect(csd, &status); 482 483 if (match == NULL) { 484 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n"); 485 goto bail; 486 } 487 488 name = ucsdet_getName(match, &status); 489 if (strcmp(name, "IBM424_rtl") != 0) { 490 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name); 491 } 492 493 ucsdet_setText(csd, bytes_r, brLength, &status); 494 match = ucsdet_detect(csd, &status); 495 496 if (match == NULL) { 497 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n"); 498 goto bail; 499 } 500 501 name = ucsdet_getName(match, &status); 502 if (strcmp(name, "IBM424_ltr") != 0) { 503 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name); 504 } 505 506bail: 507 freeBytes(bytes); 508 freeBytes(bytes_r); 509 ucsdet_close(csd); 510} 511 512static void TestIBM420(void) 513{ 514 UErrorCode status = U_ZERO_ERROR; 515 516 static const UChar chars[] = { 517 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, 518 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, 519 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 520 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, 521 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, 522 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, 523 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, 524 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 525 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, 526 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, 527 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, 528 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, 529 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 530 0x0000 531 }; 532 static const UChar chars_reverse[] = { 533 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, 534 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, 535 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, 536 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, 537 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 538 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, 539 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, 540 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, 541 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, 542 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, 543 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, 544 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, 545 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, 546 0x0000, 547 }; 548 549 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse); 550 551 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength); 552 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength); 553 554 UCharsetDetector *csd = ucsdet_open(&status); 555 const UCharsetMatch *match; 556 const char *name; 557 558 ucsdet_setText(csd, bytes, bLength, &status); 559 match = ucsdet_detect(csd, &status); 560 561 if (match == NULL) { 562 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n"); 563 goto bail; 564 } 565 566 name = ucsdet_getName(match, &status); 567 if (strcmp(name, "IBM420_rtl") != 0) { 568 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name); 569 } 570 571 ucsdet_setText(csd, bytes_r, brLength, &status); 572 match = ucsdet_detect(csd, &status); 573 574 if (match == NULL) { 575 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n"); 576 goto bail; 577 } 578 579 name = ucsdet_getName(match, &status); 580 if (strcmp(name, "IBM420_ltr") != 0) { 581 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name); 582 } 583 584bail: 585 freeBytes(bytes); 586 freeBytes(bytes_r); 587 ucsdet_close(csd); 588} 589