1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8
9#include "unicode/utypes.h"
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/unistr.h"
13#include "unicode/putil.h"
14#include "unicode/uniset.h"
15
16#include "intltest.h"
17#include "csdetest.h"
18
19#include "xmlparser.h"
20
21#include <stdlib.h>
22#include <string.h>
23
24#ifdef DEBUG_DETECT
25#include <stdio.h>
26#endif
27
// Number of elements in a true array (not a pointer). The argument and the whole
// expansion are parenthesized so the macro can be embedded in a larger expression
// (e.g. `i % ARRAY_SIZE(a)`) without operator-precedence surprises.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw, uninitialized heap storage for `count` objects of `type`, obtained from malloc().
// Release it with DELETE_ARRAY. The expansion is parenthesized so it behaves as a
// single expression wherever it is used.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Code points used as field separators by the tests below.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a test error (and keeps going) when the condition x is false.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports an error and RETURNS from the enclosing void function when errcode
// holds a failure code.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
42
43
44//---------------------------------------------------------------------------
45//
46//  Test class boilerplate
47//
48//---------------------------------------------------------------------------
49CharsetDetectionTest::CharsetDetectionTest()
50{
51}
52
53
54CharsetDetectionTest::~CharsetDetectionTest()
55{
56}
57
58
59
60void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61{
62    if (exec) logln("TestSuite CharsetDetectionTest: ");
63    switch (index) {
64       case 0: name = "ConstructionTest";
65            if (exec) ConstructionTest();
66            break;
67
68       case 1: name = "UTF8Test";
69            if (exec) UTF8Test();
70            break;
71
72       case 2: name = "UTF16Test";
73            if (exec) UTF16Test();
74            break;
75
76       case 3: name = "C1BytesTest";
77            if (exec) C1BytesTest();
78            break;
79
80       case 4: name = "InputFilterTest";
81            if (exec) InputFilterTest();
82            break;
83
84       case 5: name = "DetectionTest";
85            if (exec) DetectionTest();
86            break;
87#if !UCONFIG_NO_LEGACY_CONVERSION
88       case 6: name = "IBM424Test";
89            if (exec) IBM424Test();
90            break;
91
92       case 7: name = "IBM420Test";
93            if (exec) IBM420Test();
94            break;
95#else
96       case 6:
97       case 7: name = "skip"; break;
98#endif
99       case 8: name = "Ticket6394Test";
100            if (exec) Ticket6394Test();
101            break;
102
103       case 9: name = "Ticket6954Test";
104            if (exec) Ticket6954Test();
105            break;
106
107        default: name = "";
108            break; //needed to end loop
109    }
110}
111
112static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
113{
114    int32_t offset = -1;
115
116    splits = 1;
117    while((offset = src.indexOf(ch, offset + 1)) >= 0) {
118        splits += 1;
119    }
120
121    UnicodeString *result = new UnicodeString[splits];
122
123    int32_t start = 0;
124    int32_t split = 0;
125    int32_t end;
126
127    while((end = src.indexOf(ch, start)) >= 0) {
128        src.extractBetween(start, end, result[split++]);
129        start = end + 1;
130    }
131
132    src.extractBetween(start, src.length(), result[split]);
133
134    return result;
135}
136
137static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138{
139    int32_t sLength = source.length();
140    char *bytes = NULL;
141
142    length = source.extract(0, sLength, NULL, codepage);
143
144    if (length > 0) {
145        bytes = NEW_ARRAY(char, length + 1);
146        source.extract(0, sLength, bytes, codepage);
147    }
148
149    return bytes;
150}
151
152static void freeBytes(char *bytes)
153{
154    DELETE_ARRAY(bytes);
155}
156
157void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158{
159    int32_t splits = 0;
160    int32_t testLength = testString.length();
161    UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162    UErrorCode status = U_ZERO_ERROR;
163    int32_t cpLength = eSplit[0].length();
164    char codepage[64];
165
166    u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167    codepage[cpLength] = '\0';
168
169    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170
171    int32_t byteLength = 0;
172    char *bytes = extractBytes(testString, codepage, byteLength);
173
174    if (bytes == NULL) {
175#if !UCONFIG_NO_LEGACY_CONVERSION
176        dataerrln("Can't open a " + encoding + " converter for " + id);
177#endif
178        return;
179    }
180
181    ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
182
183    int32_t matchCount = 0;
184    const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185
186
187    UnicodeString name(ucsdet_getName(matches[0], &status));
188    UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189    UChar *decoded = NULL;
190    int32_t dLength = 0;
191
192    if (matchCount == 0) {
193        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194        goto bail;
195    }
196
197    if (name.compare(eSplit[0]) != 0) {
198        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199
200#ifdef DEBUG_DETECT
201        for (int32_t m = 0; m < matchCount; m += 1) {
202            const char *name = ucsdet_getName(matches[m], &status);
203            const char *lang = ucsdet_getLanguage(matches[m], &status);
204            int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205
206            printf("%s (%s) %d\n", name, lang, confidence);
207        }
208#endif
209        goto bail;
210    }
211
212    if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213        errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214        goto bail;
215    }
216
217    decoded = NEW_ARRAY(UChar, testLength);
218    dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219
220    if (testString.compare(decoded, dLength) != 0) {
221        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222
223#ifdef DEBUG_DETECT
224        for(int32_t i = 0; i < testLength; i += 1) {
225            if(testString[i] != decoded[i]) {
226                printf("Strings differ at byte %d\n", i);
227                break;
228            }
229        }
230#endif
231
232    }
233
234    DELETE_ARRAY(decoded);
235
236bail:
237    freeBytes(bytes);
238    delete[] eSplit;
239}
240
241const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242    UErrorCode status = U_ZERO_ERROR;
243    const char *testDataDirectory = IntlTest::getSourceTestData(status);
244
245    if (U_FAILURE(status)) {
246        errln("ERROR: getPath() failed - %s", u_errorName(status));
247        return NULL;
248    }
249
250    strcpy(buffer, testDataDirectory);
251    strcat(buffer, filename);
252    return buffer;
253}
254
255void CharsetDetectionTest::ConstructionTest()
256{
257    IcuTestErrorCode status(*this, "ConstructionTest");
258    LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259    LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260    int32_t count = uenum_count(e.getAlias(), status);
261
262#ifdef DEBUG_DETECT
263    printf("There are %d recognizers.\n", count);
264#endif
265
266    for(int32_t i = 0; i < count; i += 1) {
267        int32_t length;
268        const char *name = uenum_next(e.getAlias(), &length, status);
269
270        if(name == NULL || length <= 0) {
271            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272        }
273
274#ifdef DEBUG_DETECT
275        printf("%s\n", name);
276#endif
277    }
278
279    const char* defDisabled[] = {
280        "IBM420_rtl", "IBM420_ltr",
281        "IBM424_rtl", "IBM424_ltr",
282        0
283    };
284
285    LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
286    const char *activeName = NULL;
287
288    while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
289        // the charset must be included in all list
290        UBool found = FALSE;
291
292        const char *name = NULL;
293        uenum_reset(e.getAlias(), status);
294        while ((name = uenum_next(e.getAlias(), NULL, status))) {
295            if (strcmp(activeName, name) == 0) {
296                found = TRUE;
297                break;
298            }
299        }
300
301        if (!found) {
302            errln(UnicodeString(activeName) + " is not included in the all charset list.");
303        }
304
305        // some charsets are disabled by default
306        found = FALSE;
307        for (int32_t i = 0; defDisabled[i] != 0; i++) {
308            if (strcmp(activeName, defDisabled[i]) == 0) {
309                found = TRUE;
310                break;
311            }
312        }
313        if (found) {
314            errln(UnicodeString(activeName) + " should not be included in the default charset list.");
315        }
316    }
317}
318
319void CharsetDetectionTest::UTF8Test()
320{
321    UErrorCode status = U_ZERO_ERROR;
322    UnicodeString ss = "This is a string with some non-ascii characters that will "
323                       "be converted to UTF-8, then shoved through the detection process.  "
324                       "\\u0391\\u0392\\u0393\\u0394\\u0395"
325                       "Sure would be nice if our source could contain Unicode directly!";
326    UnicodeString s = ss.unescape();
327    int32_t byteLength = 0, sLength = s.length();
328    char *bytes = extractBytes(s, "UTF-8", byteLength);
329    UCharsetDetector *csd = ucsdet_open(&status);
330    const UCharsetMatch *match;
331    UChar *detected = NEW_ARRAY(UChar, sLength);
332
333    ucsdet_setText(csd, bytes, byteLength, &status);
334    match = ucsdet_detect(csd, &status);
335
336    if (match == NULL) {
337        errln("Detection failure for UTF-8: got no matches.");
338        goto bail;
339    }
340
341    ucsdet_getUChars(match, detected, sLength, &status);
342
343    if (s.compare(detected, sLength) != 0) {
344        errln("Round-trip test failed!");
345    }
346
347    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348
349bail:
350    DELETE_ARRAY(detected);
351    freeBytes(bytes);
352    ucsdet_close(csd);
353}
354
355void CharsetDetectionTest::UTF16Test()
356{
357    UErrorCode status = U_ZERO_ERROR;
358    /* Notice the BOM on the start of this string */
359    UChar chars[] = {
360        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364        0x064a, 0x062a, 0x0000};
365    UnicodeString s(chars);
366    int32_t beLength = 0, leLength = 0;
367    char *beBytes = extractBytes(s, "UTF-16BE", beLength);
368    char *leBytes = extractBytes(s, "UTF-16LE", leLength);
369    UCharsetDetector *csd = ucsdet_open(&status);
370    const UCharsetMatch *match;
371    const char *name;
372    int32_t conf;
373
374    ucsdet_setText(csd, beBytes, beLength, &status);
375    match = ucsdet_detect(csd, &status);
376
377    if (match == NULL) {
378        errln("Encoding detection failure for UTF-16BE: got no matches.");
379        goto try_le;
380    }
381
382    name  = ucsdet_getName(match, &status);
383    conf  = ucsdet_getConfidence(match, &status);
384
385    if (strcmp(name, "UTF-16BE") != 0) {
386        errln("Encoding detection failure for UTF-16BE: got %s", name);
387        goto try_le; // no point in looking at confidence if we got the wrong character set.
388    }
389
390    if (conf != 100) {
391        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
392    }
393
394try_le:
395    ucsdet_setText(csd, leBytes, leLength, &status);
396    match = ucsdet_detect(csd, &status);
397
398    if (match == NULL) {
399        errln("Encoding detection failure for UTF-16LE: got no matches.");
400        goto bail;
401    }
402
403    name  = ucsdet_getName(match, &status);
404    conf = ucsdet_getConfidence(match, &status);
405
406
407    if (strcmp(name, "UTF-16LE") != 0) {
408        errln("Enconding detection failure for UTF-16LE: got %s", name);
409        goto bail; // no point in looking at confidence if we got the wrong character set.
410    }
411
412    if (conf != 100) {
413        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
414    }
415
416bail:
417    freeBytes(leBytes);
418    freeBytes(beBytes);
419    ucsdet_close(csd);
420}
421
422void CharsetDetectionTest::InputFilterTest()
423{
424    UErrorCode status = U_ZERO_ERROR;
425    UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
426    UnicodeString s  = ss.unescape();
427    int32_t byteLength = 0;
428    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
429    UCharsetDetector *csd = ucsdet_open(&status);
430    const UCharsetMatch *match;
431    const char *lang, *name;
432
433    ucsdet_enableInputFilter(csd, TRUE);
434
435    if (!ucsdet_isInputFilterEnabled(csd)) {
436        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
437    }
438
439
440    ucsdet_setText(csd, bytes, byteLength, &status);
441    match = ucsdet_detect(csd, &status);
442
443    if (match == NULL) {
444        errln("Turning on the input filter resulted in no matches.");
445        goto turn_off;
446    }
447
448    name = ucsdet_getName(match, &status);
449
450    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
451        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
452    } else {
453        lang = ucsdet_getLanguage(match, &status);
454
455        if (lang == NULL || strcmp(lang, "fr") != 0) {
456            errln("Input filter did not strip markup!");
457        }
458    }
459
460turn_off:
461    ucsdet_enableInputFilter(csd, FALSE);
462    ucsdet_setText(csd, bytes, byteLength, &status);
463    match = ucsdet_detect(csd, &status);
464
465    if (match == NULL) {
466        errln("Turning off the input filter resulted in no matches.");
467        goto bail;
468    }
469
470    name = ucsdet_getName(match, &status);
471
472    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
473        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
474    } else {
475        lang = ucsdet_getLanguage(match, &status);
476
477        if (lang == NULL || strcmp(lang, "en") != 0) {
478            errln("Unfiltered input did not detect as English!");
479        }
480    }
481
482bail:
483    freeBytes(bytes);
484    ucsdet_close(csd);
485}
486
487void CharsetDetectionTest::C1BytesTest()
488{
489#if !UCONFIG_NO_LEGACY_CONVERSION
490    UErrorCode status = U_ZERO_ERROR;
491    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
492    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
493    UnicodeString sWindows  = ssWindows.unescape();
494    int32_t lISO = 0, lWindows = 0;
495    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
496    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
497    UCharsetDetector *csd = ucsdet_open(&status);
498    const UCharsetMatch *match;
499    const char *name;
500
501    ucsdet_setText(csd, bWindows, lWindows, &status);
502    match = ucsdet_detect(csd, &status);
503
504    if (match == NULL) {
505        errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
506        goto bail;
507    }
508
509    name  = ucsdet_getName(match, &status);
510
511    if (strcmp(name, "windows-1252") != 0) {
512        errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
513    }
514
515    ucsdet_setText(csd, bISO, lISO, &status);
516    match = ucsdet_detect(csd, &status);
517
518    if (match == NULL) {
519        errln("English text without C1 bytes got no matches.");
520        goto bail;
521    }
522
523    name  = ucsdet_getName(match, &status);
524
525    if (strcmp(name, "ISO-8859-1") != 0) {
526        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
527    }
528
529bail:
530    freeBytes(bWindows);
531    freeBytes(bISO);
532
533    ucsdet_close(csd);
534#endif
535}
536
537void CharsetDetectionTest::DetectionTest()
538{
539#if !UCONFIG_NO_REGULAR_EXPRESSIONS
540    UErrorCode status = U_ZERO_ERROR;
541    char path[2048];
542    const char *testFilePath = getPath(path, "csdetest.xml");
543
544    if (testFilePath == NULL) {
545        return; /* Couldn't get path: error message already output. */
546    }
547
548    UXMLParser  *parser = UXMLParser::createParser(status);
549    if (U_FAILURE(status)) {
550        dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
551        return;
552    }
553
554    UXMLElement *root   = parser->parseFile(testFilePath, status);
555    if (!assertSuccess( "parseFile",status)) return;
556
557    UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
558    UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
559    UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
560
561    const UXMLElement *testCase;
562    int32_t tc = 0;
563
564    while((testCase = root->nextChildElement(tc)) != NULL) {
565        if (testCase->getTagName().compare(test_case) == 0) {
566            const UnicodeString *id = testCase->getAttribute(id_attr);
567            const UnicodeString *encodings = testCase->getAttribute(enc_attr);
568            const UnicodeString  text = testCase->getText(TRUE);
569            int32_t encodingCount;
570            UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
571
572            for(int32_t e = 0; e < encodingCount; e += 1) {
573                checkEncoding(text, encodingList[e], *id);
574            }
575
576            delete[] encodingList;
577        }
578    }
579
580    delete root;
581    delete parser;
582#endif
583}
584
585void CharsetDetectionTest::IBM424Test()
586{
587    UErrorCode status = U_ZERO_ERROR;
588
589    static const UChar chars[] = {
590            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
591            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
592            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
593            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
594            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
595            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
596            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
597            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
598            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
599            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
600            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
601            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
602            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
603            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
604            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
605            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
606            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
607    };
608
609    static const UChar chars_reverse[] = {
610            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
611            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
612            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
613            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
614            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
615            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
616            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
617            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
618            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
619            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
620            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
621            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
622            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
623            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
624            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
625            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
626            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
627            0x0000
628    };
629
630    int32_t bLength = 0, brLength = 0;
631
632    UnicodeString s1(chars);
633    UnicodeString s2(chars_reverse);
634
635    char *bytes = extractBytes(s1, "IBM424", bLength);
636    char *bytes_r = extractBytes(s2, "IBM424", brLength);
637
638    UCharsetDetector *csd = ucsdet_open(&status);
639	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
640	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
641	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
642	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
643    if (U_FAILURE(status)) {
644        errln("Error opening charset detector. - %s", u_errorName(status));
645    }
646    const UCharsetMatch *match;
647    const char *name;
648
649    ucsdet_setText(csd, bytes, bLength, &status);
650    match = ucsdet_detect(csd, &status);
651
652    if (match == NULL) {
653        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
654        goto bail;
655    }
656
657    name  = ucsdet_getName(match, &status);
658    if (strcmp(name, "IBM424_rtl") != 0) {
659        errln("Encoding detection failure for IBM424_rtl: got %s", name);
660    }
661
662    ucsdet_setText(csd, bytes_r, brLength, &status);
663    match = ucsdet_detect(csd, &status);
664
665    if (match == NULL) {
666        errln("Encoding detection failure for IBM424_ltr: got no matches.");
667        goto bail;
668    }
669
670    name  = ucsdet_getName(match, &status);
671    if (strcmp(name, "IBM424_ltr") != 0) {
672        errln("Encoding detection failure for IBM424_ltr: got %s", name);
673    }
674
675bail:
676    freeBytes(bytes);
677    freeBytes(bytes_r);
678    ucsdet_close(csd);
679}
680
681void CharsetDetectionTest::IBM420Test()
682{
683    UErrorCode status = U_ZERO_ERROR;
684
685    static const UChar chars[] = {
686        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
687        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
688        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
689        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
690        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
691        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
692        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
693        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
694        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
695        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
696        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
697        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
698        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
699        0x0000
700    };
701    static const UChar chars_reverse[] = {
702        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
703        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
704        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
705        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
706        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
707        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
708        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
709        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
710        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
711        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
712        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
713        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
714        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
715        0x0000,
716    };
717
718    int32_t bLength = 0, brLength = 0;
719
720    UnicodeString s1(chars);
721    UnicodeString s2(chars_reverse);
722
723    char *bytes = extractBytes(s1, "IBM420", bLength);
724    char *bytes_r = extractBytes(s2, "IBM420", brLength);
725
726    UCharsetDetector *csd = ucsdet_open(&status);
727    if (U_FAILURE(status)) {
728        errln("Error opening charset detector. - %s", u_errorName(status));
729    }
730	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
731	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
732	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
733	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
734    const UCharsetMatch *match;
735    const char *name;
736
737    ucsdet_setText(csd, bytes, bLength, &status);
738    match = ucsdet_detect(csd, &status);
739
740    if (match == NULL) {
741        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
742        goto bail;
743    }
744
745    name  = ucsdet_getName(match, &status);
746    if (strcmp(name, "IBM420_rtl") != 0) {
747        errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
748    }
749
750    ucsdet_setText(csd, bytes_r, brLength, &status);
751    match = ucsdet_detect(csd, &status);
752
753    if (match == NULL) {
754        errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
755        goto bail;
756    }
757
758    name  = ucsdet_getName(match, &status);
759    if (strcmp(name, "IBM420_ltr") != 0) {
760        errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
761    }
762
763bail:
764    freeBytes(bytes);
765    freeBytes(bytes_r);
766    ucsdet_close(csd);
767}
768
769
770void CharsetDetectionTest::Ticket6394Test() {
771#if !UCONFIG_NO_CONVERSION
772    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
773                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
774                             "encodings more than once.  The hop through UnicodeString is for platforms "
775                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
776    char latin1Text[sizeof(charText)];
777    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
778
779    UErrorCode status = U_ZERO_ERROR;
780    UCharsetDetector *csd = ucsdet_open(&status);
781    ucsdet_setText(csd, latin1Text, -1, &status);
782    if (U_FAILURE(status)) {
783        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
784        return;
785    }
786
787    int32_t matchCount = 0;
788    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
789    if (U_FAILURE(status)) {
790        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
791        return;
792    }
793
794    UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
795    int32_t i;
796    for (i=0; i<matchCount; i++) {
797        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
798        if (U_FAILURE(status)) {
799            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
800            status = U_ZERO_ERROR;
801        }
802        if (setOfCharsetNames.contains(charSetName)) {
803            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
804            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
805        }
806        setOfCharsetNames.add(charSetName);
807    }
808    ucsdet_close(csd);
809#endif
810}
811
812
813// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
814//               similar Windows and non-Windows SBCS encodings. State was kept in the shared
815//               Charset Recognizer objects, and could be overwritten.
816void CharsetDetectionTest::Ticket6954Test() {
817#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
818    UErrorCode status = U_ZERO_ERROR;
819    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
820    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
821                            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
822    UnicodeString sWindows  = ssWindows.unescape();
823    int32_t lISO = 0, lWindows = 0;
824    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
825    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
826
827    // First do a plain vanilla detect of 1252 text
828
829    UCharsetDetector *csd1 = ucsdet_open(&status);
830    ucsdet_setText(csd1, bWindows, lWindows, &status);
831    const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
832    const char *name1 = ucsdet_getName(match1, &status);
833    TEST_ASSERT_SUCCESS(status);
834    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
835
836    // Next, using a completely separate detector, detect some 8859-1 text
837
838    UCharsetDetector *csd2 = ucsdet_open(&status);
839    ucsdet_setText(csd2, bISO, lISO, &status);
840    const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
841    const char *name2 = ucsdet_getName(match2, &status);
842    TEST_ASSERT_SUCCESS(status);
843    TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
844
845    // Recheck the 1252 results from the first detector, which should not have been
846    //  altered by the use of a different detector.
847
848    name1 = ucsdet_getName(match1, &status);
849    TEST_ASSERT_SUCCESS(status);
850    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
851
852    ucsdet_close(csd1);
853    ucsdet_close(csd2);
854    freeBytes(bISO);
855    freeBytes(bWindows);
856#endif
857}
858