16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org **********************************************************************
36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *   Copyright (C) 2005-2013, International Business Machines
46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *   Corporation and others.  All Rights Reserved.
56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org **********************************************************************
66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h"
106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucsdet.h"
116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucnv.h"
126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/unistr.h"
136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/putil.h"
146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h"
156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "intltest.h"
176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "csdetest.h"
186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "xmlparser.h"
206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <stdlib.h>
226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <string.h>
236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT
256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <stdio.h>
266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
// Number of elements in a true array (not a pointer).  The argument is
// parenthesized so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw-memory helpers for plain element types: NEW_ARRAY does not run
// constructors, and whatever it returns must be released with DELETE_ARRAY.
// The whole NEW_ARRAY expansion is parenthesized so the cast cannot bind to
// only part of a larger expression at the call site.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a failure (with file and line) when x is false; execution continues.
// Kept as a brace block, not do/while(0), so existing call sites are unaffected.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports a failing UErrorCode and returns from the enclosing (void) function.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//---------------------------------------------------------------------------
456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//
466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//  Test class boilerplate
476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//---------------------------------------------------------------------------
496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCharsetDetectionTest::CharsetDetectionTest()
506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCharsetDetectionTest::~CharsetDetectionTest()
556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (exec) logln("TestSuite CharsetDetectionTest: ");
636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    switch (index) {
646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 0: name = "ConstructionTest";
656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) ConstructionTest();
666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 1: name = "UTF8Test";
696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) UTF8Test();
706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 2: name = "UTF16Test";
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) UTF16Test();
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 3: name = "C1BytesTest";
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) C1BytesTest();
786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 4: name = "InputFilterTest";
816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) InputFilterTest();
826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 5: name = "DetectionTest";
856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) DetectionTest();
866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_LEGACY_CONVERSION
886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 6: name = "IBM424Test";
896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) IBM424Test();
906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 7: name = "IBM420Test";
936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) IBM420Test();
946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#else
966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 6:
976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 7: name = "skip"; break;
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 8: name = "Ticket6394Test";
1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) Ticket6394Test();
1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org       case 9: name = "Ticket6954Test";
1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (exec) Ticket6954Test();
1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        default: name = "";
1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break; //needed to end loop
1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t offset = -1;
1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    splits = 1;
1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while((offset = src.indexOf(ch, offset + 1)) >= 0) {
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        splits += 1;
1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString *result = new UnicodeString[splits];
1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t start = 0;
1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t split = 0;
1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t end;
1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while((end = src.indexOf(ch, start)) >= 0) {
1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        src.extractBetween(start, end, result[split++]);
1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        start = end + 1;
1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    src.extractBetween(start, src.length(), result[split]);
1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return result;
1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t sLength = source.length();
1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes = NULL;
1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    length = source.extract(0, sLength, NULL, codepage);
1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (length > 0) {
1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        bytes = NEW_ARRAY(char, length + 1);
1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        source.extract(0, sLength, bytes, codepage);
1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return bytes;
1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic void freeBytes(char *bytes)
1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    DELETE_ARRAY(bytes);
1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t splits = 0;
1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t testLength = testString.length();
1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t cpLength = eSplit[0].length();
1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char codepage[64];
1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    codepage[cpLength] = '\0';
1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t byteLength = 0;
1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes = extractBytes(testString, codepage, byteLength);
1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (bytes == NULL) {
1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_LEGACY_CONVERSION
1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        dataerrln("Can't open a " + encoding + " converter for " + id);
1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t matchCount = 0;
1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString name(ucsdet_getName(matches[0], &status));
1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar *decoded = NULL;
1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t dLength = 0;
1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (matchCount == 0) {
1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (name.compare(eSplit[0]) != 0) {
1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT
2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (int32_t m = 0; m < matchCount; m += 1) {
2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            const char *name = ucsdet_getName(matches[m], &status);
2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            const char *lang = ucsdet_getLanguage(matches[m], &status);
2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            int32_t confidence = ucsdet_getConfidence(matches[m], &status);
2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            printf("%s (%s) %d\n", name, lang, confidence);
2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (splits > 1 && lang.compare(eSplit[1]) != 0) {
2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    decoded = NEW_ARRAY(UChar, testLength);
2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (testString.compare(decoded, dLength) != 0) {
2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT
2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for(int32_t i = 0; i < testLength; i += 1) {
2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if(testString[i] != decoded[i]) {
2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                printf("Strings differ at byte %d\n", i);
2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                break;
2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    DELETE_ARRAY(decoded);
2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail:
2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bytes);
2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    delete[] eSplit;
2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("ERROR: getPath() failed - %s", u_errorName(status));
2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return NULL;
2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    strcpy(buffer, testDataDirectory);
2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    strcat(buffer, filename);
2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return buffer;
2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::ConstructionTest()
2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    IcuTestErrorCode status(*this, "ConstructionTest");
2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    LocalUCharsetDetectorPointer csd(ucsdet_open(status));
2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t count = uenum_count(e.getAlias(), status);
2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT
2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    printf("There are %d recognizers.\n", count);
2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    for(int32_t i = 0; i < count; i += 1) {
2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        int32_t length;
2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        const char *name = uenum_next(e.getAlias(), &length, status);
2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(name == NULL || length <= 0) {
2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifdef DEBUG_DETECT
2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        printf("%s\n", name);
2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char* defDisabled[] = {
2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        "IBM420_rtl", "IBM420_ltr",
2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        "IBM424_rtl", "IBM424_ltr",
2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0
2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    };
2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *activeName = NULL;
2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // the charset must be included in all list
2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UBool found = FALSE;
2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        const char *name = NULL;
2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        uenum_reset(e.getAlias(), status);
2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        while ((name = uenum_next(e.getAlias(), NULL, status))) {
2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (strcmp(activeName, name) == 0) {
2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                found = TRUE;
2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                break;
2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (!found) {
3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln(UnicodeString(activeName) + " is not included in the all charset list.");
3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // some charsets are disabled by default
3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        found = FALSE;
3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (int32_t i = 0; defDisabled[i] != 0; i++) {
3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (strcmp(activeName, defDisabled[i]) == 0) {
3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                found = TRUE;
3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                break;
3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (found) {
3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln(UnicodeString(activeName) + " should not be included in the default charset list.");
3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::UTF8Test()
3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString ss = "This is a string with some non-ascii characters that will "
3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                       "be converted to UTF-8, then shoved through the detection process.  "
3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                       "\\u0391\\u0392\\u0393\\u0394\\u0395"
3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                       "Sure would be nice if our source could contain Unicode directly!";
3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s = ss.unescape();
3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t byteLength = 0, sLength = s.length();
3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes = extractBytes(s, "UTF-8", byteLength);
3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd = ucsdet_open(&status);
3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match;
3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar *detected = NEW_ARRAY(UChar, sLength);
3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bytes, byteLength, &status);
3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Detection failure for UTF-8: got no matches.");
3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_getUChars(match, detected, sLength, &status);
3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (s.compare(detected, sLength) != 0) {
3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Round-trip test failed!");
3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail:
3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    DELETE_ARRAY(detected);
3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bytes);
3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd);
3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::UTF16Test()
3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /* Notice the BOM on the start of this string */
3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar chars[] = {
3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x064a, 0x062a, 0x0000};
3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s(chars);
3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t beLength = 0, leLength = 0;
3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *beBytes = extractBytes(s, "UTF-16BE", beLength);
3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *leBytes = extractBytes(s, "UTF-16LE", leLength);
3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd = ucsdet_open(&status);
3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match;
3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name;
3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t conf;
3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, beBytes, beLength, &status);
3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for UTF-16BE: got no matches.");
3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto try_le;
3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    conf  = ucsdet_getConfidence(match, &status);
3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "UTF-16BE") != 0) {
3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for UTF-16BE: got %s", name);
3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto try_le; // no point in looking at confidence if we got the wrong character set.
3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (conf != 100) {
3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgtry_le:
3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, leBytes, leLength, &status);
3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for UTF-16LE: got no matches.");
4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    conf = ucsdet_getConfidence(match, &status);
4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "UTF-16LE") != 0) {
4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Enconding detection failure for UTF-16LE: got %s", name);
4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail; // no point in looking at confidence if we got the wrong character set.
4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (conf != 100) {
4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail:
4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(leBytes);
4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(beBytes);
4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd);
4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::InputFilterTest()
4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s  = ss.unescape();
4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t byteLength = 0;
4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd = ucsdet_open(&status);
4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match;
4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *lang, *name;
4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_enableInputFilter(csd, TRUE);
4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (!ucsdet_isInputFilterEnabled(csd)) {
4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bytes, byteLength, &status);
4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Turning on the input filter resulted in no matches.");
4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto turn_off;
4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name = ucsdet_getName(match, &status);
4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        lang = ucsdet_getLanguage(match, &status);
4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (lang == NULL || strcmp(lang, "fr") != 0) {
4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln("Input filter did not strip markup!");
4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgturn_off:
4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_enableInputFilter(csd, FALSE);
4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bytes, byteLength, &status);
4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Turning off the input filter resulted in no matches.");
4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name = ucsdet_getName(match, &status);
4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        lang = ucsdet_getLanguage(match, &status);
4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (lang == NULL || strcmp(lang, "en") != 0) {
4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln("Unfiltered input did not detect as English!");
4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail:
4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bytes);
4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd);
4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::C1BytesTest()
4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_LEGACY_CONVERSION
4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString sWindows  = ssWindows.unescape();
4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t lISO = 0, lWindows = 0;
4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd = ucsdet_open(&status);
4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match;
4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name;
5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bWindows, lWindows, &status);
5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "windows-1252") != 0) {
5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bISO, lISO, &status);
5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("English text without C1 bytes got no matches.");
5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "ISO-8859-1") != 0) {
5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail:
5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bWindows);
5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bISO);
5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd);
5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::DetectionTest()
5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_REGULAR_EXPRESSIONS
5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char path[2048];
5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *testFilePath = getPath(path, "csdetest.xml");
5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (testFilePath == NULL) {
5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return; /* Couldn't get path: error message already output. */
5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UXMLParser  *parser = UXMLParser::createParser(status);
5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UXMLElement *root   = parser->parseFile(testFilePath, status);
5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (!assertSuccess( "parseFile",status)) return;
5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UXMLElement *testCase;
5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t tc = 0;
5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while((testCase = root->nextChildElement(tc)) != NULL) {
5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (testCase->getTagName().compare(test_case) == 0) {
5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            const UnicodeString *id = testCase->getAttribute(id_attr);
5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            const UnicodeString *encodings = testCase->getAttribute(enc_attr);
5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            const UnicodeString  text = testCase->getText(TRUE);
5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            int32_t encodingCount;
5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            for(int32_t e = 0; e < encodingCount; e += 1) {
5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                checkEncoding(text, encodingList[e], *id);
5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            delete[] encodingList;
5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
5786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
5796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    delete root;
5816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    delete parser;
5826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
5836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
5846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::IBM424Test()
5866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
5876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
5886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
5896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static const UChar chars[] = {
5906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
5916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
5926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
5936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
5946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
5956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
5966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
5976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
5986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
5996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
6006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
6016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
6026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
6036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
6046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
6056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
6066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
6076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    };
6086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static const UChar chars_reverse[] = {
6106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
6116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
6126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
6136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
6146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
6156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
6166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
6176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
6186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
6196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
6206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
6216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
6226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
6236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
6246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
6256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
6266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
6276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            0x0000
6286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    };
6296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t bLength = 0, brLength = 0;
6316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s1(chars);
6336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s2(chars_reverse);
6346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes = extractBytes(s1, "IBM424", bLength);
6366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes_r = extractBytes(s2, "IBM424", brLength);
6376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd = ucsdet_open(&status);
6396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
6406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
6416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
6426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
6436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
6446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Error opening charset detector. - %s", u_errorName(status));
6456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
6466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match;
6476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name;
6486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bytes, bLength, &status);
6506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
6516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
6536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
6546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
6556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
6566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
6586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "IBM424_rtl") != 0) {
6596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for IBM424_rtl: got %s", name);
6606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
6616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bytes_r, brLength, &status);
6636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
6646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
6666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for IBM424_ltr: got no matches.");
6676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
6686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
6696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
6716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "IBM424_ltr") != 0) {
6726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for IBM424_ltr: got %s", name);
6736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
6746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail:
6766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bytes);
6776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bytes_r);
6786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd);
6796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
6806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::IBM420Test()
6826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
6836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
6846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
6856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static const UChar chars[] = {
6866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
6876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
6886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
6896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
6906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
6916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
6926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
6936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
6946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
6956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
6966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
6976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
6986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
6996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0000
7006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    };
7016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static const UChar chars_reverse[] = {
7026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
7036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
7046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
7056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
7066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
7076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
7086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
7096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
7106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
7116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
7126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
7136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
7146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
7156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        0x0000,
7166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    };
7176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t bLength = 0, brLength = 0;
7196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s1(chars);
7216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s2(chars_reverse);
7226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes = extractBytes(s1, "IBM420", bLength);
7246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bytes_r = extractBytes(s2, "IBM420", brLength);
7256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd = ucsdet_open(&status);
7276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
7286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Error opening charset detector. - %s", u_errorName(status));
7296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
7306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
7316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
7326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
7336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
7346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match;
7356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name;
7366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bytes, bLength, &status);
7386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
7396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
7416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
7426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
7436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
7446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
7466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "IBM420_rtl") != 0) {
7476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
7486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
7496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, bytes_r, brLength, &status);
7516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    match = ucsdet_detect(csd, &status);
7526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (match == NULL) {
7546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
7556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto bail;
7566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
7576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name  = ucsdet_getName(match, &status);
7596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (strcmp(name, "IBM420_ltr") != 0) {
7606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
7616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
7626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbail:
7646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bytes);
7656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bytes_r);
7666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd);
7676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
7686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::Ticket6394Test() {
7716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_CONVERSION
7726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
7736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
7746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                             "encodings more than once.  The hop through UnicodeString is for platforms "
7756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
7766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char latin1Text[sizeof(charText)];
7776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
7786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
7806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd = ucsdet_open(&status);
7816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd, latin1Text, -1, &status);
7826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
7836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
7846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
7856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
7866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t matchCount = 0;
7886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
7896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
7906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
7916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
7926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
7936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
7946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
7956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t i;
7966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    for (i=0; i<matchCount; i++) {
7976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
7986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (U_FAILURE(status)) {
7996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
8006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            status = U_ZERO_ERROR;
8016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
8026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (setOfCharsetNames.contains(charSetName)) {
8036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
8046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
8056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
8066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        setOfCharsetNames.add(charSetName);
8076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
8086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd);
8096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
8106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
8116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
8146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//               similar Windows and non-Windows SBCS encodings. State was kept in the shared
8156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//               Charset Recognizer objects, and could be overwritten.
8166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid CharsetDetectionTest::Ticket6954Test() {
8176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
8186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
8196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
8206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
8216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
8226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString sWindows  = ssWindows.unescape();
8236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t lISO = 0, lWindows = 0;
8246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
8256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
8266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // First do a plain vanilla detect of 1252 text
8286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd1 = ucsdet_open(&status);
8306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd1, bWindows, lWindows, &status);
8316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
8326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name1 = ucsdet_getName(match1, &status);
8336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    TEST_ASSERT_SUCCESS(status);
8346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
8356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Next, using a completely separate detector, detect some 8859-1 text
8376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UCharsetDetector *csd2 = ucsdet_open(&status);
8396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_setText(csd2, bISO, lISO, &status);
8406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
8416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name2 = ucsdet_getName(match2, &status);
8426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    TEST_ASSERT_SUCCESS(status);
8436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
8446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Recheck the 1252 results from the first detector, which should not have been
8466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //  altered by the use of a different detector.
8476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    name1 = ucsdet_getName(match1, &status);
8496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    TEST_ASSERT_SUCCESS(status);
8506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
8516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
8526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd1);
8536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ucsdet_close(csd2);
8546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bISO);
8556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    freeBytes(bWindows);
8566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
8576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
858