16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ********************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copyright (C) 2005-2013, International Business Machines 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Corporation and others. All Rights Reserved. 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ********************************************************************** 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucsdet.h" 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucnv.h" 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/unistr.h" 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/putil.h" 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h" 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "intltest.h" 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "csdetest.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "xmlparser.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <stdlib.h> 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <string.h> 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <stdio.h> 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type)) 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array)) 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define CH_SPACE 0x0020 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define CH_SLASH 0x002F 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define TEST_ASSERT(x) {if (!(x)) { \ 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\ 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return;}} 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//--------------------------------------------------------------------------- 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Test class boilerplate 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//--------------------------------------------------------------------------- 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCharsetDetectionTest::CharsetDetectionTest() 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCharsetDetectionTest::~CharsetDetectionTest() 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) logln("TestSuite CharsetDetectionTest: "); 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org switch (index) { 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 0: name = "ConstructionTest"; 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) ConstructionTest(); 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 1: name = "UTF8Test"; 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) UTF8Test(); 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 2: name = "UTF16Test"; 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) UTF16Test(); 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 3: name = "C1BytesTest"; 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) C1BytesTest(); 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 4: name = "InputFilterTest"; 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) InputFilterTest(); 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 5: name = "DetectionTest"; 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) DetectionTest(); 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_LEGACY_CONVERSION 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 6: name = "IBM424Test"; 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) IBM424Test(); 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 7: name = "IBM420Test"; 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) IBM420Test(); 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#else 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 6: 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 7: name = "skip"; break; 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 8: name = "Ticket6394Test"; 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) Ticket6394Test(); 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 9: name = "Ticket6954Test"; 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (exec) Ticket6954Test(); 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org default: name = ""; 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; //needed to end loop 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t offset = -1; 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org splits = 1; 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while((offset = src.indexOf(ch, offset + 1)) >= 0) { 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org splits += 1; 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *result = new UnicodeString[splits]; 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start = 0; 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t split = 0; 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t end; 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while((end = src.indexOf(ch, start)) >= 0) { 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org src.extractBetween(start, end, result[split++]); 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start = end + 1; 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org src.extractBetween(start, src.length(), result[split]); 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return result; 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length) 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t sLength = source.length(); 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes = NULL; 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length = source.extract(0, sLength, NULL, codepage); 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (length > 0) { 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bytes = NEW_ARRAY(char, length + 1); 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org source.extract(0, sLength, bytes, codepage); 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return bytes; 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic void freeBytes(char *bytes) 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org DELETE_ARRAY(bytes); 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id) 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t splits = 0; 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t testLength = testString.length(); 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *eSplit = split(encoding, CH_SLASH, splits); 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cpLength = eSplit[0].length(); 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char codepage[64]; 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org codepage[cpLength] = '\0'; 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t byteLength = 0; 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes = extractBytes(testString, codepage, byteLength); 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (bytes == NULL) { 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_LEGACY_CONVERSION 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org dataerrln("Can't open a " + encoding + " converter for " + id); 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd.getAlias(), bytes, byteLength, &status); 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t matchCount = 0; 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status); 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString name(ucsdet_getName(matches[0], &status)); 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar *decoded = NULL; 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t dLength = 0; 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (matchCount == 0) { 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches"); 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (name.compare(eSplit[0]) != 0) { 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t m = 0; m < matchCount; m += 1) { 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name = ucsdet_getName(matches[m], &status); 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *lang = ucsdet_getLanguage(matches[m], &status); 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t confidence = ucsdet_getConfidence(matches[m], &status); 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("%s (%s) %d\n", name, lang, confidence); 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (splits > 1 && lang.compare(eSplit[1]) != 0) { 2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang); 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org decoded = NEW_ARRAY(UChar, testLength); 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (testString.compare(decoded, dLength) != 0) { 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string."); 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i = 0; i < testLength; i += 1) { 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(testString[i] != decoded[i]) { 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("Strings differ at byte %d\n", i); 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org DELETE_ARRAY(decoded); 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail: 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bytes); 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete[] eSplit; 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) { 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *testDataDirectory = IntlTest::getSourceTestData(status); 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("ERROR: getPath() failed - %s", u_errorName(status)); 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org strcpy(buffer, testDataDirectory); 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org strcat(buffer, filename); 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return buffer; 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::ConstructionTest() 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IcuTestErrorCode status(*this, "ConstructionTest"); 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org LocalUCharsetDetectorPointer csd(ucsdet_open(status)); 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status)); 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count = uenum_count(e.getAlias(), status); 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("There are %d recognizers.\n", count); 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i = 0; i < count; i += 1) { 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length; 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name = uenum_next(e.getAlias(), &length, status); 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(name == NULL || length <= 0) { 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("%s\n", name); 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char* defDisabled[] = { 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "IBM420_rtl", "IBM420_ltr", 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "IBM424_rtl", "IBM424_ltr", 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org }; 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status)); 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *activeName = NULL; 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) { 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the charset must be included in all list 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool found = FALSE; 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name = NULL; 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uenum_reset(e.getAlias(), status); 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while ((name = uenum_next(e.getAlias(), NULL, status))) { 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(activeName, name) == 0) { 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org found = TRUE; 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (!found) { 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln(UnicodeString(activeName) + " is not included in the all charset list."); 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // some charsets are disabled by default 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org found = FALSE; 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t i = 0; defDisabled[i] != 0; i++) { 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(activeName, defDisabled[i]) == 0) { 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org found = TRUE; 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (found) { 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln(UnicodeString(activeName) + " should not be included in the default charset list."); 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::UTF8Test() 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString ss = "This is a string with some non-ascii characters that will " 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "be converted to UTF-8, then shoved through the detection process. " 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "\\u0391\\u0392\\u0393\\u0394\\u0395" 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "Sure would be nice if our source could contain Unicode directly!"; 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s = ss.unescape(); 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t byteLength = 0, sLength = s.length(); 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes = extractBytes(s, "UTF-8", byteLength); 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd = ucsdet_open(&status); 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match; 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar *detected = NEW_ARRAY(UChar, sLength); 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bytes, byteLength, &status); 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Detection failure for UTF-8: got no matches."); 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_getUChars(match, detected, sLength, &status); 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (s.compare(detected, sLength) != 0) { 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Round-trip test failed!"); 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail: 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org DELETE_ARRAY(detected); 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bytes); 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd); 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::UTF16Test() 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* Notice the BOM on the start of this string */ 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar chars[] = { 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x064a, 0x062a, 0x0000}; 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s(chars); 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t beLength = 0, leLength = 0; 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *beBytes = extractBytes(s, "UTF-16BE", beLength); 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *leBytes = extractBytes(s, "UTF-16LE", leLength); 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd = ucsdet_open(&status); 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match; 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name; 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t conf; 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, beBytes, beLength, &status); 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for UTF-16BE: got no matches."); 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto try_le; 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org conf = ucsdet_getConfidence(match, &status); 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "UTF-16BE") != 0) { 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for UTF-16BE: got %s", name); 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto try_le; // no point in looking at confidence if we got the wrong character set. 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (conf != 100) { 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgtry_le: 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, leBytes, leLength, &status); 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for UTF-16LE: got no matches."); 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org conf = ucsdet_getConfidence(match, &status); 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "UTF-16LE") != 0) { 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Enconding detection failure for UTF-16LE: got %s", name); 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; // no point in looking at confidence if we got the wrong character set. 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (conf != 100) { 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail: 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(leBytes); 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(beBytes); 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd); 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::InputFilterTest() 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s = ss.unescape(); 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t byteLength = 0; 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes = extractBytes(s, "ISO-8859-1", byteLength); 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd = ucsdet_open(&status); 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match; 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *lang, *name; 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_enableInputFilter(csd, TRUE); 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (!ucsdet_isInputFilterEnabled(csd)) { 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"); 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bytes, byteLength, &status); 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Turning on the input filter resulted in no matches."); 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto turn_off; 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lang = ucsdet_getLanguage(match, &status); 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (lang == NULL || strcmp(lang, "fr") != 0) { 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Input filter did not strip markup!"); 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgturn_off: 4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_enableInputFilter(csd, FALSE); 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bytes, byteLength, &status); 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Turning off the input filter resulted in no matches."); 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lang = ucsdet_getLanguage(match, &status); 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (lang == NULL || strcmp(lang, "en") != 0) { 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Unfiltered input did not detect as English!"); 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail: 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bytes); 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd); 4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::C1BytesTest() 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_LEGACY_CONVERSION 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString sWindows = ssWindows.unescape(); 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lISO = 0, lWindows = 0; 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd = ucsdet_open(&status); 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match; 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name; 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bWindows, lWindows, &status); 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status)); 5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "windows-1252") != 0) { 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("English text with C1 bytes does not detect as windows-1252, but as %s", name); 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bISO, lISO, &status); 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("English text without C1 bytes got no matches."); 5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "ISO-8859-1") != 0) { 5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name); 5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail: 5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bWindows); 5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bISO); 5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd); 5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::DetectionTest() 5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_REGULAR_EXPRESSIONS 5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char path[2048]; 5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *testFilePath = getPath(path, "csdetest.xml"); 5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (testFilePath == NULL) { 5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; /* Couldn't get path: error message already output. */ 5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UXMLParser *parser = UXMLParser::createParser(status); 5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); 5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UXMLElement *root = parser->parseFile(testFilePath, status); 5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (!assertSuccess( "parseFile",status)) return; 5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); 5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); 5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); 5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UXMLElement *testCase; 5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t tc = 0; 5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while((testCase = root->nextChildElement(tc)) != NULL) { 5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (testCase->getTagName().compare(test_case) == 0) { 5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString *id = testCase->getAttribute(id_attr); 5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString *encodings = testCase->getAttribute(enc_attr); 5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString text = testCase->getText(TRUE); 5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t encodingCount; 5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount); 5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t e = 0; e < encodingCount; e += 1) { 5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org checkEncoding(text, encodingList[e], *id); 5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete[] encodingList; 5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete root; 5816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete parser; 5826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 5836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::IBM424Test() 5866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 5876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 5886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const UChar chars[] = { 5906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, 5916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, 5926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, 5936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, 5946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, 5956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, 5966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 5976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, 5986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, 5996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, 6006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, 6016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 6026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 6036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 6046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, 6056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 6066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 6076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org }; 6086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const UChar chars_reverse[] = { 6106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, 6116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, 6126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 6136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 6146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, 6156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, 6166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, 6176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, 6186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, 6196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, 6206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 6216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, 6226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, 6236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, 6246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, 6256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, 6266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, 6276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0000 6286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org }; 6296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t bLength = 0, brLength = 0; 6316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s1(chars); 6336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s2(chars_reverse); 6346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes = extractBytes(s1, "IBM424", bLength); 6366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes_r = extractBytes(s2, "IBM424", brLength); 6376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd = ucsdet_open(&status); 6396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); 6406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); 6416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); 6426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); 6436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 6446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Error opening charset detector. - %s", u_errorName(status)); 6456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match; 6476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name; 6486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bytes, bLength, &status); 6506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 6516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 6536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status)); 6546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 6556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 6586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "IBM424_rtl") != 0) { 6596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for IBM424_rtl: got %s", name); 6606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bytes_r, brLength, &status); 6636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 6646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 6666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for IBM424_ltr: got no matches."); 6676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 6686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 6716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "IBM424_ltr") != 0) { 6726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for IBM424_ltr: got %s", name); 6736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail: 6766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bytes); 6776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bytes_r); 6786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd); 6796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 6806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::IBM420Test() 6826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 6836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 6846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const UChar chars[] = { 6866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, 6876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, 6886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 6896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, 6906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, 6916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, 6926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, 6936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 6946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, 6956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, 6966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, 6976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, 6986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 6996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0000 7006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org }; 7016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const UChar chars_reverse[] = { 7026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, 7036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, 7046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, 7056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, 7066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 7076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, 7086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, 7096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, 7106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, 7116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, 7126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, 7136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, 7146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, 7156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 0x0000, 7166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org }; 7176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t bLength = 0, brLength = 0; 7196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s1(chars); 7216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s2(chars_reverse); 7226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes = extractBytes(s1, "IBM420", bLength); 7246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bytes_r = extractBytes(s2, "IBM420", brLength); 7256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd = ucsdet_open(&status); 7276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 7286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Error opening charset detector. - %s", u_errorName(status)); 7296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); 7316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); 7326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); 7336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); 7346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match; 7356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name; 7366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bytes, bLength, &status); 7386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 7396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 7416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status)); 7426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 7436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 7466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "IBM420_rtl") != 0) { 7476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for IBM420_rtl: got %s\n", name); 7486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, bytes_r, brLength, &status); 7516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org match = ucsdet_detect(csd, &status); 7526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (match == NULL) { 7546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); 7556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto bail; 7566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name = ucsdet_getName(match, &status); 7596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (strcmp(name, "IBM420_ltr") != 0) { 7606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Encoding detection failure for IBM420_ltr: got %s\n", name); 7616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail: 7646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bytes); 7656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bytes_r); 7666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd); 7676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 7686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::Ticket6394Test() { 7716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_CONVERSION 7726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char charText[] = "Here is some random English text that should be detected as ISO-8859-1." 7736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected " 7746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "encodings more than once. The hop through UnicodeString is for platforms " 7756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "where this char * string is be EBCDIC and needs conversion to Latin1."; 7766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char latin1Text[sizeof(charText)]; 7776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1"); 7786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 7806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd = ucsdet_open(&status); 7816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd, latin1Text, -1, &status); 7826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 7836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 7846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 7856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t matchCount = 0; 7886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); 7896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 7906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 7916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 7926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet setOfCharsetNames; // UnicodSets can hold strings. 7956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i; 7966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (i=0; i<matchCount; i++) { 7976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString charSetName(ucsdet_getName(matches[i], &status)); 7986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 7996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i); 8006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ZERO_ERROR; 8016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (setOfCharsetNames.contains(charSetName)) { 8036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln("Fail at file %s, line %d ", __FILE__, __LINE__); 8046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org errln(UnicodeString(" Duplicate charset name = ") + charSetName); 8056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setOfCharsetNames.add(charSetName); 8076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd); 8096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 8106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 8116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between 8146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// similar Windows and non-Windows SBCS encodings. State was kept in the shared 8156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Charset Recognizer objects, and could be overwritten. 8166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::Ticket6954Test() { 8176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING 8186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 8196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 8206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly." 8216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); 8226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString sWindows = ssWindows.unescape(); 8236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lISO = 0, lWindows = 0; 8246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); 8256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); 8266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // First do a plain vanilla detect of 1252 text 8286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd1 = ucsdet_open(&status); 8306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd1, bWindows, lWindows, &status); 8316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match1 = ucsdet_detect(csd1, &status); 8326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name1 = ucsdet_getName(match1, &status); 8336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TEST_ASSERT_SUCCESS(status); 8346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TEST_ASSERT(strcmp(name1, "windows-1252")==0); 8356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Next, using a completely separate detector, detect some 8859-1 text 8376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsetDetector *csd2 = ucsdet_open(&status); 8396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_setText(csd2, bISO, lISO, &status); 8406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UCharsetMatch *match2 = ucsdet_detect(csd2, &status); 8416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name2 = ucsdet_getName(match2, &status); 8426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TEST_ASSERT_SUCCESS(status); 8436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0); 8446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Recheck the 1252 results from the first detector, which should not have been 8466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // altered by the use of a different detector. 8476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name1 = ucsdet_getName(match1, &status); 8496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TEST_ASSERT_SUCCESS(status); 8506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TEST_ASSERT(strcmp(name1, "windows-1252")==0); 8516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd1); 8536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucsdet_close(csd2); 8546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bISO); 8556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org freeBytes(bWindows); 8566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 8576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 858