1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/*
2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru **********************************************************************
31b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert *   Copyright (C) 2005-2015, International Business Machines
4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *   Corporation and others.  All Rights Reserved.
5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru **********************************************************************
6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h"
10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucsdet.h"
11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucnv.h"
12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/unistr.h"
13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/putil.h"
14b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uniset.h"
15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "intltest.h"
17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "csdetest.h"
18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "xmlparser.h"
20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdlib.h>
22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <string.h>
23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT
25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdio.h>
26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates uninitialized storage for `count` objects of `type` with malloc().
// The whole expansion is parenthesized so the result can be used inside a
// larger expression (e.g. subscripted) without precedence surprises.
// Storage obtained here must be released with DELETE_ARRAY, not delete[].
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// UTF-16 code units used as separators by the helpers in this file.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
// Logs a failure (file and line of the call site) through errln() when the
// condition `x` is false. The enclosing test keeps running afterwards.
#define TEST_ASSERT(x) \
    { \
        if (!(x)) { \
            errln("Failure in file %s, line %d", __FILE__, __LINE__); \
        } \
    }

// Logs a failure through errcheckln() when `errcode` holds an ICU failure
// status, then returns from the enclosing function. Because of the bare
// `return;` it can only be used inside functions returning void.
#define TEST_ASSERT_SUCCESS(errcode) \
    { \
        if (U_FAILURE(errcode)) { \
            errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
            return; \
        } \
    }
4254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
4354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------
45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//
46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//  Test class boilerplate
47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//
48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------
49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCharsetDetectionTest::CharsetDetectionTest()
50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruCharsetDetectionTest::~CharsetDetectionTest()
55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (exec) logln("TestSuite CharsetDetectionTest: ");
63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    switch (index) {
64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru       case 0: name = "ConstructionTest";
65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            if (exec) ConstructionTest();
66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            break;
67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru       case 1: name = "UTF8Test";
69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            if (exec) UTF8Test();
70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            break;
71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru       case 2: name = "UTF16Test";
73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            if (exec) UTF16Test();
74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            break;
75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru       case 3: name = "C1BytesTest";
77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            if (exec) C1BytesTest();
78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            break;
79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru       case 4: name = "InputFilterTest";
81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            if (exec) InputFilterTest();
82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            break;
83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru       case 5: name = "DetectionTest";
85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            if (exec) DetectionTest();
86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            break;
8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_LEGACY_CONVERSION
88b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru       case 6: name = "IBM424Test";
89b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (exec) IBM424Test();
90b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            break;
91b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
92b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru       case 7: name = "IBM420Test";
93b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (exec) IBM420Test();
94b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            break;
9550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#else
9650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho       case 6:
9750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho       case 7: name = "skip"; break;
9850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif
99b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru       case 8: name = "Ticket6394Test";
100b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (exec) Ticket6394Test();
101b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            break;
102b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
10354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius       case 9: name = "Ticket6954Test";
10454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            if (exec) Ticket6954Test();
10554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            break;
10654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        default: name = "";
108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            break; //needed to end loop
109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t offset = -1;
115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    splits = 1;
117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    while((offset = src.indexOf(ch, offset + 1)) >= 0) {
118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        splits += 1;
119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString *result = new UnicodeString[splits];
122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t start = 0;
124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t split = 0;
125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t end;
126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    while((end = src.indexOf(ch, start)) >= 0) {
128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        src.extractBetween(start, end, result[split++]);
129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        start = end + 1;
130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    src.extractBetween(start, src.length(), result[split]);
133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    return result;
135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t sLength = source.length();
140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *bytes = NULL;
141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    length = source.extract(0, sLength, NULL, codepage);
143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (length > 0) {
145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        bytes = NEW_ARRAY(char, length + 1);
146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        source.extract(0, sLength, bytes, codepage);
147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    return bytes;
150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic void freeBytes(char *bytes)
153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    DELETE_ARRAY(bytes);
155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t splits = 0;
160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t testLength = testString.length();
161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t cpLength = eSplit[0].length();
164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char codepage[64];
165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    codepage[cpLength] = '\0';
168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t byteLength = 0;
172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *bytes = extractBytes(testString, codepage, byteLength);
173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (bytes == NULL) {
175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_LEGACY_CONVERSION
176b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        dataerrln("Can't open a " + encoding + " converter for " + id);
177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        return;
179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
18150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t matchCount = 0;
18450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString name(ucsdet_getName(matches[0], &status));
188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UChar *decoded = NULL;
190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t dLength = 0;
191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (matchCount == 0) {
193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (name.compare(eSplit[0]) != 0) {
198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT
201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        for (int32_t m = 0; m < matchCount; m += 1) {
202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            const char *name = ucsdet_getName(matches[m], &status);
203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            const char *lang = ucsdet_getLanguage(matches[m], &status);
204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            printf("%s (%s) %d\n", name, lang, confidence);
207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    decoded = NEW_ARRAY(UChar, testLength);
218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (testString.compare(decoded, dLength) != 0) {
221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT
224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        for(int32_t i = 0; i < testLength; i += 1) {
225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            if(testString[i] != decoded[i]) {
226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                printf("Strings differ at byte %d\n", i);
227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                break;
228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            }
229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    DELETE_ARRAY(decoded);
235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail:
237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    freeBytes(bytes);
238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    delete[] eSplit;
239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruconst char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const char *testDataDirectory = IntlTest::getSourceTestData(status);
244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (U_FAILURE(status)) {
246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("ERROR: getPath() failed - %s", u_errorName(status));
247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        return NULL;
248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    strcpy(buffer, testDataDirectory);
251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    strcat(buffer, filename);
252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    return buffer;
253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::ConstructionTest()
256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
25750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    IcuTestErrorCode status(*this, "ConstructionTest");
25850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    LocalUCharsetDetectorPointer csd(ucsdet_open(status));
25950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
26050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t count = uenum_count(e.getAlias(), status);
261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT
263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    printf("There are %d recognizers.\n", count);
264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    for(int32_t i = 0; i < count; i += 1) {
267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        int32_t length;
26850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        const char *name = uenum_next(e.getAlias(), &length, status);
269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        if(name == NULL || length <= 0) {
271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifdef DEBUG_DETECT
275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        printf("%s\n", name);
276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
27859d709d503bab6e2b61931737e662dd293b40578ccornelius
27959d709d503bab6e2b61931737e662dd293b40578ccornelius    const char* defDisabled[] = {
28059d709d503bab6e2b61931737e662dd293b40578ccornelius        "IBM420_rtl", "IBM420_ltr",
28159d709d503bab6e2b61931737e662dd293b40578ccornelius        "IBM424_rtl", "IBM424_ltr",
28259d709d503bab6e2b61931737e662dd293b40578ccornelius        0
28359d709d503bab6e2b61931737e662dd293b40578ccornelius    };
28459d709d503bab6e2b61931737e662dd293b40578ccornelius
28559d709d503bab6e2b61931737e662dd293b40578ccornelius    LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
28659d709d503bab6e2b61931737e662dd293b40578ccornelius    const char *activeName = NULL;
28759d709d503bab6e2b61931737e662dd293b40578ccornelius
28859d709d503bab6e2b61931737e662dd293b40578ccornelius    while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
28959d709d503bab6e2b61931737e662dd293b40578ccornelius        // the charset must be included in all list
29059d709d503bab6e2b61931737e662dd293b40578ccornelius        UBool found = FALSE;
29159d709d503bab6e2b61931737e662dd293b40578ccornelius
29259d709d503bab6e2b61931737e662dd293b40578ccornelius        const char *name = NULL;
29359d709d503bab6e2b61931737e662dd293b40578ccornelius        uenum_reset(e.getAlias(), status);
29459d709d503bab6e2b61931737e662dd293b40578ccornelius        while ((name = uenum_next(e.getAlias(), NULL, status))) {
29559d709d503bab6e2b61931737e662dd293b40578ccornelius            if (strcmp(activeName, name) == 0) {
29659d709d503bab6e2b61931737e662dd293b40578ccornelius                found = TRUE;
29759d709d503bab6e2b61931737e662dd293b40578ccornelius                break;
29859d709d503bab6e2b61931737e662dd293b40578ccornelius            }
29959d709d503bab6e2b61931737e662dd293b40578ccornelius        }
30059d709d503bab6e2b61931737e662dd293b40578ccornelius
30159d709d503bab6e2b61931737e662dd293b40578ccornelius        if (!found) {
30259d709d503bab6e2b61931737e662dd293b40578ccornelius            errln(UnicodeString(activeName) + " is not included in the all charset list.");
30359d709d503bab6e2b61931737e662dd293b40578ccornelius        }
30459d709d503bab6e2b61931737e662dd293b40578ccornelius
30559d709d503bab6e2b61931737e662dd293b40578ccornelius        // some charsets are disabled by default
30659d709d503bab6e2b61931737e662dd293b40578ccornelius        found = FALSE;
30759d709d503bab6e2b61931737e662dd293b40578ccornelius        for (int32_t i = 0; defDisabled[i] != 0; i++) {
30859d709d503bab6e2b61931737e662dd293b40578ccornelius            if (strcmp(activeName, defDisabled[i]) == 0) {
30959d709d503bab6e2b61931737e662dd293b40578ccornelius                found = TRUE;
31059d709d503bab6e2b61931737e662dd293b40578ccornelius                break;
31159d709d503bab6e2b61931737e662dd293b40578ccornelius            }
31259d709d503bab6e2b61931737e662dd293b40578ccornelius        }
31359d709d503bab6e2b61931737e662dd293b40578ccornelius        if (found) {
31459d709d503bab6e2b61931737e662dd293b40578ccornelius            errln(UnicodeString(activeName) + " should not be included in the default charset list.");
31559d709d503bab6e2b61931737e662dd293b40578ccornelius        }
31659d709d503bab6e2b61931737e662dd293b40578ccornelius    }
317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::UTF8Test()
320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString ss = "This is a string with some non-ascii characters that will "
323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                       "be converted to UTF-8, then shoved through the detection process.  "
324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                       "\\u0391\\u0392\\u0393\\u0394\\u0395"
325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                       "Sure would be nice if our source could contain Unicode directly!";
326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString s = ss.unescape();
327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t byteLength = 0, sLength = s.length();
328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *bytes = extractBytes(s, "UTF-8", byteLength);
329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UCharsetDetector *csd = ucsdet_open(&status);
330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const UCharsetMatch *match;
331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UChar *detected = NEW_ARRAY(UChar, sLength);
332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setText(csd, bytes, byteLength, &status);
334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    match = ucsdet_detect(csd, &status);
335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (match == NULL) {
337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Detection failure for UTF-8: got no matches.");
338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_getUChars(match, detected, sLength, &status);
342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (s.compare(detected, sLength) != 0) {
344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Round-trip test failed!");
345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail:
350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    DELETE_ARRAY(detected);
351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    freeBytes(bytes);
352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_close(csd);
353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::UTF16Test()
356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    /* Notice the BOM on the start of this string */
359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UChar chars[] = {
360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        0x064a, 0x062a, 0x0000};
365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString s(chars);
366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t beLength = 0, leLength = 0;
367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *beBytes = extractBytes(s, "UTF-16BE", beLength);
368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *leBytes = extractBytes(s, "UTF-16LE", leLength);
369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UCharsetDetector *csd = ucsdet_open(&status);
370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const UCharsetMatch *match;
371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const char *name;
372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t conf;
373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setText(csd, beBytes, beLength, &status);
375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    match = ucsdet_detect(csd, &status);
376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (match == NULL) {
378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Encoding detection failure for UTF-16BE: got no matches.");
379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto try_le;
380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    name  = ucsdet_getName(match, &status);
383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    conf  = ucsdet_getConfidence(match, &status);
384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (strcmp(name, "UTF-16BE") != 0) {
386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Encoding detection failure for UTF-16BE: got %s", name);
387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto try_le; // no point in looking at confidence if we got the wrong character set.
388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (conf != 100) {
391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querutry_le:
395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setText(csd, leBytes, leLength, &status);
396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    match = ucsdet_detect(csd, &status);
397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (match == NULL) {
399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Encoding detection failure for UTF-16LE: got no matches.");
400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    name  = ucsdet_getName(match, &status);
404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    conf = ucsdet_getConfidence(match, &status);
405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (strcmp(name, "UTF-16LE") != 0) {
408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Enconding detection failure for UTF-16LE: got %s", name);
409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail; // no point in looking at confidence if we got the wrong character set.
410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (conf != 100) {
413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail:
417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    freeBytes(leBytes);
418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    freeBytes(beBytes);
419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_close(csd);
420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::InputFilterTest()
423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString s  = ss.unescape();
427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t byteLength = 0;
428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UCharsetDetector *csd = ucsdet_open(&status);
430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const UCharsetMatch *match;
431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const char *lang, *name;
432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_enableInputFilter(csd, TRUE);
434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (!ucsdet_isInputFilterEnabled(csd)) {
436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setText(csd, bytes, byteLength, &status);
441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    match = ucsdet_detect(csd, &status);
442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (match == NULL) {
444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Turning on the input filter resulted in no matches.");
445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto turn_off;
446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    name = ucsdet_getName(match, &status);
449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    } else {
453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        lang = ucsdet_getLanguage(match, &status);
454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        if (lang == NULL || strcmp(lang, "fr") != 0) {
456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            errln("Input filter did not strip markup!");
457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruturn_off:
461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_enableInputFilter(csd, FALSE);
462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setText(csd, bytes, byteLength, &status);
463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    match = ucsdet_detect(csd, &status);
464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (match == NULL) {
466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Turning off the input filter resulted in no matches.");
467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    name = ucsdet_getName(match, &status);
471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    } else {
475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        lang = ucsdet_getLanguage(match, &status);
476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        if (lang == NULL || strcmp(lang, "en") != 0) {
478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            errln("Unfiltered input did not detect as English!");
479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail:
483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    freeBytes(bytes);
484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_close(csd);
485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::C1BytesTest()
488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_LEGACY_CONVERSION
490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
492c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString sWindows  = ssWindows.unescape();
494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t lISO = 0, lWindows = 0;
495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UCharsetDetector *csd = ucsdet_open(&status);
498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const UCharsetMatch *match;
499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const char *name;
500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setText(csd, bWindows, lWindows, &status);
502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    match = ucsdet_detect(csd, &status);
503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (match == NULL) {
5056d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    name  = ucsdet_getName(match, &status);
510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (strcmp(name, "windows-1252") != 0) {
512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_setText(csd, bISO, lISO, &status);
516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    match = ucsdet_detect(csd, &status);
517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (match == NULL) {
519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("English text without C1 bytes got no matches.");
520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        goto bail;
521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    name  = ucsdet_getName(match, &status);
524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (strcmp(name, "ISO-8859-1") != 0) {
526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querubail:
530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    freeBytes(bWindows);
531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    freeBytes(bISO);
532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    ucsdet_close(csd);
534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid CharsetDetectionTest::DetectionTest()
538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{
539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS
540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    char path[2048];
542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const char *testFilePath = getPath(path, "csdetest.xml");
543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (testFilePath == NULL) {
545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        return; /* Couldn't get path: error message already output. */
546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UXMLParser  *parser = UXMLParser::createParser(status);
5496d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru    if (U_FAILURE(status)) {
5506d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
5516d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        return;
5526d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru    }
5536d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru
554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UXMLElement *root   = parser->parseFile(testFilePath, status);
555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if (!assertSuccess( "parseFile",status)) return;
556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    const UXMLElement *testCase;
562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t tc = 0;
563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    while((testCase = root->nextChildElement(tc)) != NULL) {
565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        if (testCase->getTagName().compare(test_case) == 0) {
566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            const UnicodeString *id = testCase->getAttribute(id_attr);
567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            const UnicodeString *encodings = testCase->getAttribute(enc_attr);
568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            const UnicodeString  text = testCase->getText(TRUE);
569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            int32_t encodingCount;
570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            for(int32_t e = 0; e < encodingCount; e += 1) {
573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                checkEncoding(text, encodingList[e], *id);
574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            }
575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            delete[] encodingList;
577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    delete root;
581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    delete parser;
582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif
583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
585b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid CharsetDetectionTest::IBM424Test()
586b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{
5871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#if !UCONFIG_ONLY_HTML_CONVERSION
588b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
589b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
590b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    static const UChar chars[] = {
591b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
592b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
593b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
594b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
595b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
596b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
597b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
598b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
599b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
600b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
601b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
602b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
603b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
604b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
605b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
606b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
607b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
608b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    };
609b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
610b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    static const UChar chars_reverse[] = {
611b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
612b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
613b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
614b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
615b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
616b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
617b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
618b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
619b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
620b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
621b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
622b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
623b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
624b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
625b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
626b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
627b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
628b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            0x0000
629b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    };
630b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
631b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t bLength = 0, brLength = 0;
632b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
633b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UnicodeString s1(chars);
634b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UnicodeString s2(chars_reverse);
635b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
636b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    char *bytes = extractBytes(s1, "IBM424", bLength);
637b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    char *bytes_r = extractBytes(s2, "IBM424", brLength);
638b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
639b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UCharsetDetector *csd = ucsdet_open(&status);
64059d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
64159d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
64259d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
64359d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
6446d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru    if (U_FAILURE(status)) {
6456d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errln("Error opening charset detector. - %s", u_errorName(status));
6466d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru    }
647b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    const UCharsetMatch *match;
648b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    const char *name;
649b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
650b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_setText(csd, bytes, bLength, &status);
651b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    match = ucsdet_detect(csd, &status);
652b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
653b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (match == NULL) {
6546d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
655b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        goto bail;
656b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
657b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
658b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    name  = ucsdet_getName(match, &status);
659b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (strcmp(name, "IBM424_rtl") != 0) {
6606d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errln("Encoding detection failure for IBM424_rtl: got %s", name);
661b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
662b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
663b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_setText(csd, bytes_r, brLength, &status);
664b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    match = ucsdet_detect(csd, &status);
665b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
666b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (match == NULL) {
6676d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errln("Encoding detection failure for IBM424_ltr: got no matches.");
668b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        goto bail;
669b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
670b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
671b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    name  = ucsdet_getName(match, &status);
672b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (strcmp(name, "IBM424_ltr") != 0) {
6736d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errln("Encoding detection failure for IBM424_ltr: got %s", name);
674b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
675b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
676b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querubail:
677b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    freeBytes(bytes);
678b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    freeBytes(bytes_r);
679b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_close(csd);
6801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif
681b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
682b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
683b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid CharsetDetectionTest::IBM420Test()
684b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{
6851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#if !UCONFIG_ONLY_HTML_CONVERSION
686b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
687b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
688b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    static const UChar chars[] = {
689b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
690b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
691b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
692b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
693b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
694b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
695b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
696b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
697b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
698b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
699b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
700b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
701b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
702b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0000
703b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    };
704b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    static const UChar chars_reverse[] = {
705b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
706b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
707b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
708b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
709b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
710b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
711b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
712b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
713b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
714b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
715b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
716b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
717b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
718b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        0x0000,
719b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    };
720b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
721b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t bLength = 0, brLength = 0;
722b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
723b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UnicodeString s1(chars);
724b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UnicodeString s2(chars_reverse);
725b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
726b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    char *bytes = extractBytes(s1, "IBM420", bLength);
727b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    char *bytes_r = extractBytes(s2, "IBM420", brLength);
728b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
729b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UCharsetDetector *csd = ucsdet_open(&status);
7306d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru    if (U_FAILURE(status)) {
7316d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errln("Error opening charset detector. - %s", u_errorName(status));
7326d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru    }
73359d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
73459d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
73559d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
73659d709d503bab6e2b61931737e662dd293b40578ccornelius	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
737b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    const UCharsetMatch *match;
738b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    const char *name;
739b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
740b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_setText(csd, bytes, bLength, &status);
741b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    match = ucsdet_detect(csd, &status);
742b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
743b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (match == NULL) {
7446d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
745b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        goto bail;
746b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
747b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
748b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    name  = ucsdet_getName(match, &status);
749b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (strcmp(name, "IBM420_rtl") != 0) {
750b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
751b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
752b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
753b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_setText(csd, bytes_r, brLength, &status);
754b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    match = ucsdet_detect(csd, &status);
755b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
756b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (match == NULL) {
757b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
758b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        goto bail;
759b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
760b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
761b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    name  = ucsdet_getName(match, &status);
762b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (strcmp(name, "IBM420_ltr") != 0) {
763b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
764b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
765b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
766b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querubail:
767b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    freeBytes(bytes);
768b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    freeBytes(bytes_r);
769b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_close(csd);
7701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif
771b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
772b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
773b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
774b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid CharsetDetectionTest::Ticket6394Test() {
775b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_CONVERSION
776b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
777b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
778b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                             "encodings more than once.  The hop through UnicodeString is for platforms "
779b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
780b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    char latin1Text[sizeof(charText)];
781b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
782b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
783b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
784b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UCharsetDetector *csd = ucsdet_open(&status);
785b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_setText(csd, latin1Text, -1, &status);
786b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (U_FAILURE(status)) {
787b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
788b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        return;
789b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
790b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
791b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t matchCount = 0;
792b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
793b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (U_FAILURE(status)) {
794b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
795b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        return;
796b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
797b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
798b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
799b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t i;
800b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    for (i=0; i<matchCount; i++) {
801b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
802b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (U_FAILURE(status)) {
803b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
804b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            status = U_ZERO_ERROR;
805b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
806b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (setOfCharsetNames.contains(charSetName)) {
807b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
808b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
809b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
810b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        setOfCharsetNames.add(charSetName);
811b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
812b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ucsdet_close(csd);
813b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif
814b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
81654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
81754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
81854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius//               similar Windows and non-Windows SBCS encodings. State was kept in the shared
81954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius//               Charset Recognizer objects, and could be overwritten.
82054dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusvoid CharsetDetectionTest::Ticket6954Test() {
82159d709d503bab6e2b61931737e662dd293b40578ccornelius#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
82254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UErrorCode status = U_ZERO_ERROR;
82354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
82454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
82554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
82654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UnicodeString sWindows  = ssWindows.unescape();
82754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t lISO = 0, lWindows = 0;
82854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
82954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
83054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
83154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // First do a plain vanilla detect of 1252 text
83254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
83354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UCharsetDetector *csd1 = ucsdet_open(&status);
83454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    ucsdet_setText(csd1, bWindows, lWindows, &status);
83554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
83654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    const char *name1 = ucsdet_getName(match1, &status);
83754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    TEST_ASSERT_SUCCESS(status);
83854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
83954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
84054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Next, using a completely separate detector, detect some 8859-1 text
84154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
84254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UCharsetDetector *csd2 = ucsdet_open(&status);
84354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    ucsdet_setText(csd2, bISO, lISO, &status);
84454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
84554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    const char *name2 = ucsdet_getName(match2, &status);
84654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    TEST_ASSERT_SUCCESS(status);
84754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
84854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
84954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Recheck the 1252 results from the first detector, which should not have been
85054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    //  altered by the use of a different detector.
85154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
85254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    name1 = ucsdet_getName(match1, &status);
85354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    TEST_ASSERT_SUCCESS(status);
85454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
85554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
85654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    ucsdet_close(csd1);
85754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    ucsdet_close(csd2);
85854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    freeBytes(bISO);
85954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    freeBytes(bWindows);
86054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif
86154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}
862