csdetest.cpp revision 6d5deb12725f146643d443090dfa11b206df528a
1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2009, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8
9#include "unicode/utypes.h"
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/unistr.h"
13#include "unicode/putil.h"
14#include "unicode/uniset.h"
15
16#include "intltest.h"
17#include "csdetest.h"
18
19#include "xmlparser.h"
20
21#include <stdlib.h>
22#include <string.h>
23
24#ifdef DEBUG_DETECT
25#include <stdio.h>
26#endif
27
// Number of elements in a true array (not a pointer). The argument and the
// whole expansion are parenthesized so the macro can be used safely inside a
// larger expression such as `i % ARRAY_SIZE(a)`.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw malloc-backed storage for `count` objects of `type`; no constructors are
// run. Storage obtained from NEW_ARRAY must be released with DELETE_ARRAY.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Separator characters used when splitting encoding lists ("a b c") and
// encoding/language pairs ("name/lang").
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
35
36//---------------------------------------------------------------------------
37//
38//  Test class boilerplate
39//
40//---------------------------------------------------------------------------
41CharsetDetectionTest::CharsetDetectionTest()
42{
43}
44
45
46CharsetDetectionTest::~CharsetDetectionTest()
47{
48}
49
50
51
52void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
53{
54    if (exec) logln("TestSuite CharsetDetectionTest: ");
55    switch (index) {
56       case 0: name = "ConstructionTest";
57            if (exec) ConstructionTest();
58            break;
59
60       case 1: name = "UTF8Test";
61            if (exec) UTF8Test();
62            break;
63
64       case 2: name = "UTF16Test";
65            if (exec) UTF16Test();
66            break;
67
68       case 3: name = "C1BytesTest";
69            if (exec) C1BytesTest();
70            break;
71
72       case 4: name = "InputFilterTest";
73            if (exec) InputFilterTest();
74            break;
75
76       case 5: name = "DetectionTest";
77            if (exec) DetectionTest();
78            break;
79
80       case 6: name = "IBM424Test";
81            if (exec) IBM424Test();
82            break;
83
84       case 7: name = "IBM420Test";
85            if (exec) IBM420Test();
86            break;
87
88       case 8: name = "Ticket6394Test";
89            if (exec) Ticket6394Test();
90            break;
91
92        default: name = "";
93            break; //needed to end loop
94    }
95}
96
97static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
98{
99    int32_t offset = -1;
100
101    splits = 1;
102    while((offset = src.indexOf(ch, offset + 1)) >= 0) {
103        splits += 1;
104    }
105
106    UnicodeString *result = new UnicodeString[splits];
107
108    int32_t start = 0;
109    int32_t split = 0;
110    int32_t end;
111
112    while((end = src.indexOf(ch, start)) >= 0) {
113        src.extractBetween(start, end, result[split++]);
114        start = end + 1;
115    }
116
117    src.extractBetween(start, src.length(), result[split]);
118
119    return result;
120}
121
122static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
123{
124    int32_t sLength = source.length();
125    char *bytes = NULL;
126
127    length = source.extract(0, sLength, NULL, codepage);
128
129    if (length > 0) {
130        bytes = NEW_ARRAY(char, length + 1);
131        source.extract(0, sLength, bytes, codepage);
132    }
133
134    return bytes;
135}
136
137static void freeBytes(char *bytes)
138{
139    DELETE_ARRAY(bytes);
140}
141
142void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
143{
144    int32_t splits = 0;
145    int32_t testLength = testString.length();
146    UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
147    UErrorCode status = U_ZERO_ERROR;
148    int32_t cpLength = eSplit[0].length();
149    char codepage[64];
150
151    u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
152    codepage[cpLength] = '\0';
153
154    UCharsetDetector *csd = ucsdet_open(&status);
155
156    int32_t byteLength = 0;
157    char *bytes = extractBytes(testString, codepage, byteLength);
158
159    if (bytes == NULL) {
160#if !UCONFIG_NO_LEGACY_CONVERSION
161        errln("Can't open a " + encoding + " converter for " + id);
162#endif
163        return;
164    }
165
166    ucsdet_setText(csd, bytes, byteLength, &status);
167
168    int32_t matchCount = 0;
169    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
170
171
172    UnicodeString name(ucsdet_getName(matches[0], &status));
173    UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
174    UChar *decoded = NULL;
175    int32_t dLength = 0;
176
177    if (matchCount == 0) {
178        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
179        goto bail;
180    }
181
182    if (name.compare(eSplit[0]) != 0) {
183        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
184
185#ifdef DEBUG_DETECT
186        for (int32_t m = 0; m < matchCount; m += 1) {
187            const char *name = ucsdet_getName(matches[m], &status);
188            const char *lang = ucsdet_getLanguage(matches[m], &status);
189            int32_t confidence = ucsdet_getConfidence(matches[m], &status);
190
191            printf("%s (%s) %d\n", name, lang, confidence);
192        }
193#endif
194        goto bail;
195    }
196
197    if (splits > 1 && lang.compare(eSplit[1]) != 0) {
198        errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
199        goto bail;
200    }
201
202    decoded = NEW_ARRAY(UChar, testLength);
203    dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
204
205    if (testString.compare(decoded, dLength) != 0) {
206        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
207
208#ifdef DEBUG_DETECT
209        for(int32_t i = 0; i < testLength; i += 1) {
210            if(testString[i] != decoded[i]) {
211                printf("Strings differ at byte %d\n", i);
212                break;
213            }
214        }
215#endif
216
217    }
218
219    DELETE_ARRAY(decoded);
220
221bail:
222    freeBytes(bytes);
223    ucsdet_close(csd);
224    delete[] eSplit;
225}
226
227const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
228    UErrorCode status = U_ZERO_ERROR;
229    const char *testDataDirectory = IntlTest::getSourceTestData(status);
230
231    if (U_FAILURE(status)) {
232        errln("ERROR: getPath() failed - %s", u_errorName(status));
233        return NULL;
234    }
235
236    strcpy(buffer, testDataDirectory);
237    strcat(buffer, filename);
238    return buffer;
239}
240
241void CharsetDetectionTest::ConstructionTest()
242{
243    UErrorCode status = U_ZERO_ERROR;
244    UCharsetDetector *csd = ucsdet_open(&status);
245    UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
246    int32_t count = uenum_count(e, &status);
247
248#ifdef DEBUG_DETECT
249    printf("There are %d recognizers.\n", count);
250#endif
251
252    for(int32_t i = 0; i < count; i += 1) {
253        int32_t length;
254        const char *name = uenum_next(e, &length, &status);
255
256        if(name == NULL || length <= 0) {
257            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
258        }
259
260#ifdef DEBUG_DETECT
261        printf("%s\n", name);
262#endif
263    }
264
265    uenum_close(e);
266    ucsdet_close(csd);
267}
268
269void CharsetDetectionTest::UTF8Test()
270{
271    UErrorCode status = U_ZERO_ERROR;
272    UnicodeString ss = "This is a string with some non-ascii characters that will "
273                       "be converted to UTF-8, then shoved through the detection process.  "
274                       "\\u0391\\u0392\\u0393\\u0394\\u0395"
275                       "Sure would be nice if our source could contain Unicode directly!";
276    UnicodeString s = ss.unescape();
277    int32_t byteLength = 0, sLength = s.length();
278    char *bytes = extractBytes(s, "UTF-8", byteLength);
279    UCharsetDetector *csd = ucsdet_open(&status);
280    const UCharsetMatch *match;
281    UChar *detected = NEW_ARRAY(UChar, sLength);
282
283    ucsdet_setText(csd, bytes, byteLength, &status);
284    match = ucsdet_detect(csd, &status);
285
286    if (match == NULL) {
287        errln("Detection failure for UTF-8: got no matches.");
288        goto bail;
289    }
290
291    ucsdet_getUChars(match, detected, sLength, &status);
292
293    if (s.compare(detected, sLength) != 0) {
294        errln("Round-trip test failed!");
295    }
296
297    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
298
299bail:
300    DELETE_ARRAY(detected);
301    freeBytes(bytes);
302    ucsdet_close(csd);
303}
304
305void CharsetDetectionTest::UTF16Test()
306{
307    UErrorCode status = U_ZERO_ERROR;
308    /* Notice the BOM on the start of this string */
309    UChar chars[] = {
310        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
311        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
312        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
313        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
314        0x064a, 0x062a, 0x0000};
315    UnicodeString s(chars);
316    int32_t beLength = 0, leLength = 0;
317    char *beBytes = extractBytes(s, "UTF-16BE", beLength);
318    char *leBytes = extractBytes(s, "UTF-16LE", leLength);
319    UCharsetDetector *csd = ucsdet_open(&status);
320    const UCharsetMatch *match;
321    const char *name;
322    int32_t conf;
323
324    ucsdet_setText(csd, beBytes, beLength, &status);
325    match = ucsdet_detect(csd, &status);
326
327    if (match == NULL) {
328        errln("Encoding detection failure for UTF-16BE: got no matches.");
329        goto try_le;
330    }
331
332    name  = ucsdet_getName(match, &status);
333    conf  = ucsdet_getConfidence(match, &status);
334
335    if (strcmp(name, "UTF-16BE") != 0) {
336        errln("Encoding detection failure for UTF-16BE: got %s", name);
337        goto try_le; // no point in looking at confidence if we got the wrong character set.
338    }
339
340    if (conf != 100) {
341        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
342    }
343
344try_le:
345    ucsdet_setText(csd, leBytes, leLength, &status);
346    match = ucsdet_detect(csd, &status);
347
348    if (match == NULL) {
349        errln("Encoding detection failure for UTF-16LE: got no matches.");
350        goto bail;
351    }
352
353    name  = ucsdet_getName(match, &status);
354    conf = ucsdet_getConfidence(match, &status);
355
356
357    if (strcmp(name, "UTF-16LE") != 0) {
358        errln("Enconding detection failure for UTF-16LE: got %s", name);
359        goto bail; // no point in looking at confidence if we got the wrong character set.
360    }
361
362    if (conf != 100) {
363        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
364    }
365
366bail:
367    freeBytes(leBytes);
368    freeBytes(beBytes);
369    ucsdet_close(csd);
370}
371
372void CharsetDetectionTest::InputFilterTest()
373{
374    UErrorCode status = U_ZERO_ERROR;
375    UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
376    UnicodeString s  = ss.unescape();
377    int32_t byteLength = 0;
378    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
379    UCharsetDetector *csd = ucsdet_open(&status);
380    const UCharsetMatch *match;
381    const char *lang, *name;
382
383    ucsdet_enableInputFilter(csd, TRUE);
384
385    if (!ucsdet_isInputFilterEnabled(csd)) {
386        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
387    }
388
389
390    ucsdet_setText(csd, bytes, byteLength, &status);
391    match = ucsdet_detect(csd, &status);
392
393    if (match == NULL) {
394        errln("Turning on the input filter resulted in no matches.");
395        goto turn_off;
396    }
397
398    name = ucsdet_getName(match, &status);
399
400    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
401        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
402    } else {
403        lang = ucsdet_getLanguage(match, &status);
404
405        if (lang == NULL || strcmp(lang, "fr") != 0) {
406            errln("Input filter did not strip markup!");
407        }
408    }
409
410turn_off:
411    ucsdet_enableInputFilter(csd, FALSE);
412    ucsdet_setText(csd, bytes, byteLength, &status);
413    match = ucsdet_detect(csd, &status);
414
415    if (match == NULL) {
416        errln("Turning off the input filter resulted in no matches.");
417        goto bail;
418    }
419
420    name = ucsdet_getName(match, &status);
421
422    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
423        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
424    } else {
425        lang = ucsdet_getLanguage(match, &status);
426
427        if (lang == NULL || strcmp(lang, "en") != 0) {
428            errln("Unfiltered input did not detect as English!");
429        }
430    }
431
432bail:
433    freeBytes(bytes);
434    ucsdet_close(csd);
435}
436
437void CharsetDetectionTest::C1BytesTest()
438{
439#if !UCONFIG_NO_LEGACY_CONVERSION
440    UErrorCode status = U_ZERO_ERROR;
441    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
442    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
443    UnicodeString sWindows  = ssWindows.unescape();
444    int32_t lISO = 0, lWindows = 0;
445    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
446    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
447    UCharsetDetector *csd = ucsdet_open(&status);
448    const UCharsetMatch *match;
449    const char *name;
450
451    ucsdet_setText(csd, bWindows, lWindows, &status);
452    match = ucsdet_detect(csd, &status);
453
454    if (match == NULL) {
455        errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
456        goto bail;
457    }
458
459    name  = ucsdet_getName(match, &status);
460
461    if (strcmp(name, "windows-1252") != 0) {
462        errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
463    }
464
465    ucsdet_setText(csd, bISO, lISO, &status);
466    match = ucsdet_detect(csd, &status);
467
468    if (match == NULL) {
469        errln("English text without C1 bytes got no matches.");
470        goto bail;
471    }
472
473    name  = ucsdet_getName(match, &status);
474
475    if (strcmp(name, "ISO-8859-1") != 0) {
476        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
477    }
478
479bail:
480    freeBytes(bWindows);
481    freeBytes(bISO);
482
483    ucsdet_close(csd);
484#endif
485}
486
487void CharsetDetectionTest::DetectionTest()
488{
489#if !UCONFIG_NO_REGULAR_EXPRESSIONS
490    UErrorCode status = U_ZERO_ERROR;
491    char path[2048];
492    const char *testFilePath = getPath(path, "csdetest.xml");
493
494    if (testFilePath == NULL) {
495        return; /* Couldn't get path: error message already output. */
496    }
497
498    UXMLParser  *parser = UXMLParser::createParser(status);
499    if (U_FAILURE(status)) {
500        dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
501        return;
502    }
503
504    UXMLElement *root   = parser->parseFile(testFilePath, status);
505    if (!assertSuccess( "parseFile",status)) return;
506
507    UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
508    UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
509    UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
510
511    const UXMLElement *testCase;
512    int32_t tc = 0;
513
514    while((testCase = root->nextChildElement(tc)) != NULL) {
515        if (testCase->getTagName().compare(test_case) == 0) {
516            const UnicodeString *id = testCase->getAttribute(id_attr);
517            const UnicodeString *encodings = testCase->getAttribute(enc_attr);
518            const UnicodeString  text = testCase->getText(TRUE);
519            int32_t encodingCount;
520            UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
521
522            for(int32_t e = 0; e < encodingCount; e += 1) {
523                checkEncoding(text, encodingList[e], *id);
524            }
525
526            delete[] encodingList;
527        }
528    }
529
530    delete root;
531    delete parser;
532#endif
533}
534
535void CharsetDetectionTest::IBM424Test()
536{
537    UErrorCode status = U_ZERO_ERROR;
538
539    static const UChar chars[] = {
540            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
541            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
542            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
543            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
544            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
545            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
546            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
547            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
548            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
549            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
550            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
551            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
552            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
553            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
554            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
555            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
556            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
557    };
558
559    static const UChar chars_reverse[] = {
560            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
561            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
562            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
563            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
564            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
565            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
566            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
567            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
568            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
569            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
570            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
571            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
572            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
573            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
574            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
575            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
576            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
577            0x0000
578    };
579
580    int32_t bLength = 0, brLength = 0;
581
582    UnicodeString s1(chars);
583    UnicodeString s2(chars_reverse);
584
585    char *bytes = extractBytes(s1, "IBM424", bLength);
586    char *bytes_r = extractBytes(s2, "IBM424", brLength);
587
588    UCharsetDetector *csd = ucsdet_open(&status);
589    if (U_FAILURE(status)) {
590        errln("Error opening charset detector. - %s", u_errorName(status));
591    }
592    const UCharsetMatch *match;
593    const char *name;
594
595    ucsdet_setText(csd, bytes, bLength, &status);
596    match = ucsdet_detect(csd, &status);
597
598    if (match == NULL) {
599        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
600        goto bail;
601    }
602
603    name  = ucsdet_getName(match, &status);
604    if (strcmp(name, "IBM424_rtl") != 0) {
605        errln("Encoding detection failure for IBM424_rtl: got %s", name);
606    }
607
608    ucsdet_setText(csd, bytes_r, brLength, &status);
609    match = ucsdet_detect(csd, &status);
610
611    if (match == NULL) {
612        errln("Encoding detection failure for IBM424_ltr: got no matches.");
613        goto bail;
614    }
615
616    name  = ucsdet_getName(match, &status);
617    if (strcmp(name, "IBM424_ltr") != 0) {
618        errln("Encoding detection failure for IBM424_ltr: got %s", name);
619    }
620
621bail:
622    freeBytes(bytes);
623    freeBytes(bytes_r);
624    ucsdet_close(csd);
625}
626
627void CharsetDetectionTest::IBM420Test()
628{
629    UErrorCode status = U_ZERO_ERROR;
630
631    static const UChar chars[] = {
632        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
633        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
634        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
635        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
636        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
637        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
638        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
639        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
640        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
641        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
642        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
643        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
644        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
645        0x0000
646    };
647    static const UChar chars_reverse[] = {
648        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
649        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
650        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
651        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
652        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
653        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
654        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
655        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
656        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
657        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
658        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
659        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
660        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
661        0x0000,
662    };
663
664    int32_t bLength = 0, brLength = 0;
665
666    UnicodeString s1(chars);
667    UnicodeString s2(chars_reverse);
668
669    char *bytes = extractBytes(s1, "IBM420", bLength);
670    char *bytes_r = extractBytes(s2, "IBM420", brLength);
671
672    UCharsetDetector *csd = ucsdet_open(&status);
673    if (U_FAILURE(status)) {
674        errln("Error opening charset detector. - %s", u_errorName(status));
675    }
676    const UCharsetMatch *match;
677    const char *name;
678
679    ucsdet_setText(csd, bytes, bLength, &status);
680    match = ucsdet_detect(csd, &status);
681
682    if (match == NULL) {
683        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
684        goto bail;
685    }
686
687    name  = ucsdet_getName(match, &status);
688    if (strcmp(name, "IBM420_rtl") != 0) {
689        errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
690    }
691
692    ucsdet_setText(csd, bytes_r, brLength, &status);
693    match = ucsdet_detect(csd, &status);
694
695    if (match == NULL) {
696        errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
697        goto bail;
698    }
699
700    name  = ucsdet_getName(match, &status);
701    if (strcmp(name, "IBM420_ltr") != 0) {
702        errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
703    }
704
705bail:
706    freeBytes(bytes);
707    freeBytes(bytes_r);
708    ucsdet_close(csd);
709}
710
711
712void CharsetDetectionTest::Ticket6394Test() {
713#if !UCONFIG_NO_CONVERSION
714    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
715                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
716                             "encodings more than once.  The hop through UnicodeString is for platforms "
717                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
718    char latin1Text[sizeof(charText)];
719    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
720
721    UErrorCode status = U_ZERO_ERROR;
722    UCharsetDetector *csd = ucsdet_open(&status);
723    ucsdet_setText(csd, latin1Text, -1, &status);
724    if (U_FAILURE(status)) {
725        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
726        return;
727    }
728
729    int32_t matchCount = 0;
730    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
731    if (U_FAILURE(status)) {
732        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
733        return;
734    }
735
736    UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
737    int32_t i;
738    for (i=0; i<matchCount; i++) {
739        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
740        if (U_FAILURE(status)) {
741            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
742            status = U_ZERO_ERROR;
743        }
744        if (setOfCharsetNames.contains(charSetName)) {
745            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
746            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
747        }
748        setOfCharsetNames.add(charSetName);
749    }
750    ucsdet_close(csd);
751#endif
752}
753
754