1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 **********************************************************************
5 *   Copyright (C) 2005-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9
10
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/ucsdet.h"
#include "unicode/ucnv.h"
#include "unicode/unistr.h"
#include "unicode/putil.h"
#include "unicode/uniset.h"

#include "intltest.h"
#include "csdetest.h"

#include "xmlparser.h"

#include <stdlib.h>
#include <string.h>

#ifdef DEBUG_DETECT
#include <stdio.h>
#endif
29
// Allocates uninitialized storage for `count` objects of `type` with malloc().
// The result is not checked for NULL here; release it with DELETE_ARRAY.
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
// Releases storage obtained from NEW_ARRAY by calling free().
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Delimiter code units handed to split(): U+0020 SPACE and U+002F SOLIDUS.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a test failure (with file and line) when `x` is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports a failure (with file, line and error name) when `errcode` is a
// failure code, and then RETURNS from the enclosing function. Only usable in
// functions returning void; nothing after the macro runs on failure.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
42
43
44//---------------------------------------------------------------------------
45//
46//  Test class boilerplate
47//
48//---------------------------------------------------------------------------
// Constructor: the body is empty.
CharsetDetectionTest::CharsetDetectionTest()
{
}


// Destructor: the body is empty.
CharsetDetectionTest::~CharsetDetectionTest()
{
}
57
58
59
60void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61{
62    if (exec) logln("TestSuite CharsetDetectionTest: ");
63    switch (index) {
64       case 0: name = "ConstructionTest";
65            if (exec) ConstructionTest();
66            break;
67
68       case 1: name = "UTF8Test";
69            if (exec) UTF8Test();
70            break;
71
72       case 2: name = "UTF16Test";
73            if (exec) UTF16Test();
74            break;
75
76       case 3: name = "C1BytesTest";
77            if (exec) C1BytesTest();
78            break;
79
80       case 4: name = "InputFilterTest";
81            if (exec) InputFilterTest();
82            break;
83
84       case 5: name = "DetectionTest";
85            if (exec) DetectionTest();
86            break;
87#if !UCONFIG_NO_LEGACY_CONVERSION
88       case 6: name = "IBM424Test";
89            if (exec) IBM424Test();
90            break;
91
92       case 7: name = "IBM420Test";
93            if (exec) IBM420Test();
94            break;
95#else
96       case 6:
97       case 7: name = "skip"; break;
98#endif
99       case 8: name = "Ticket6394Test";
100            if (exec) Ticket6394Test();
101            break;
102
103       case 9: name = "Ticket6954Test";
104            if (exec) Ticket6954Test();
105            break;
106
107        default: name = "";
108            break; //needed to end loop
109    }
110}
111
112static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
113{
114    int32_t offset = -1;
115
116    splits = 1;
117    while((offset = src.indexOf(ch, offset + 1)) >= 0) {
118        splits += 1;
119    }
120
121    UnicodeString *result = new UnicodeString[splits];
122
123    int32_t start = 0;
124    int32_t split = 0;
125    int32_t end;
126
127    while((end = src.indexOf(ch, start)) >= 0) {
128        src.extractBetween(start, end, result[split++]);
129        start = end + 1;
130    }
131
132    src.extractBetween(start, src.length(), result[split]);
133
134    return result;
135}
136
137static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138{
139    int32_t sLength = source.length();
140    char *bytes = NULL;
141
142    length = source.extract(0, sLength, NULL, codepage);
143
144    if (length > 0) {
145        bytes = NEW_ARRAY(char, length + 1);
146        source.extract(0, sLength, bytes, codepage);
147    }
148
149    return bytes;
150}
151
// Releases a buffer returned by extractBytes(). DELETE_ARRAY expands to
// free(), so passing NULL is harmless.
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
156
157void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158{
159    int32_t splits = 0;
160    int32_t testLength = testString.length();
161    UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162    UErrorCode status = U_ZERO_ERROR;
163    int32_t cpLength = eSplit[0].length();
164    char codepage[64];
165
166    u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167    codepage[cpLength] = '\0';
168
169    LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170
171    int32_t byteLength = 0;
172    char *bytes = extractBytes(testString, codepage, byteLength);
173
174    if (bytes == NULL) {
175#if !UCONFIG_NO_LEGACY_CONVERSION
176        dataerrln("Can't open a " + encoding + " converter for " + id);
177#endif
178        return;
179    }
180
181    ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
182
183    int32_t matchCount = 0;
184    const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185
186
187    UnicodeString name(ucsdet_getName(matches[0], &status));
188    UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189    UChar *decoded = NULL;
190    int32_t dLength = 0;
191
192    if (matchCount == 0) {
193        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194        goto bail;
195    }
196
197    if (name.compare(eSplit[0]) != 0) {
198        errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199
200#ifdef DEBUG_DETECT
201        for (int32_t m = 0; m < matchCount; m += 1) {
202            const char *name = ucsdet_getName(matches[m], &status);
203            const char *lang = ucsdet_getLanguage(matches[m], &status);
204            int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205
206            printf("%s (%s) %d\n", name, lang, confidence);
207        }
208#endif
209        goto bail;
210    }
211
212    if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213        errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214        goto bail;
215    }
216
217    decoded = NEW_ARRAY(UChar, testLength);
218    dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219
220    if (testString.compare(decoded, dLength) != 0) {
221        errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222
223#ifdef DEBUG_DETECT
224        for(int32_t i = 0; i < testLength; i += 1) {
225            if(testString[i] != decoded[i]) {
226                printf("Strings differ at byte %d\n", i);
227                break;
228            }
229        }
230#endif
231
232    }
233
234    DELETE_ARRAY(decoded);
235
236bail:
237    freeBytes(bytes);
238    delete[] eSplit;
239}
240
241const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242    UErrorCode status = U_ZERO_ERROR;
243    const char *testDataDirectory = IntlTest::getSourceTestData(status);
244
245    if (U_FAILURE(status)) {
246        errln("ERROR: getPath() failed - %s", u_errorName(status));
247        return NULL;
248    }
249
250    strcpy(buffer, testDataDirectory);
251    strcat(buffer, filename);
252    return buffer;
253}
254
255void CharsetDetectionTest::ConstructionTest()
256{
257    IcuTestErrorCode status(*this, "ConstructionTest");
258    LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259    LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260    int32_t count = uenum_count(e.getAlias(), status);
261
262#ifdef DEBUG_DETECT
263    printf("There are %d recognizers.\n", count);
264#endif
265
266    for(int32_t i = 0; i < count; i += 1) {
267        int32_t length;
268        const char *name = uenum_next(e.getAlias(), &length, status);
269
270        if(name == NULL || length <= 0) {
271            errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272        }
273
274#ifdef DEBUG_DETECT
275        printf("%s\n", name);
276#endif
277    }
278
279    const char* defDisabled[] = {
280        "IBM420_rtl", "IBM420_ltr",
281        "IBM424_rtl", "IBM424_ltr",
282        0
283    };
284
285    LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
286    const char *activeName = NULL;
287
288    while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
289        // the charset must be included in all list
290        UBool found = FALSE;
291
292        const char *name = NULL;
293        uenum_reset(e.getAlias(), status);
294        while ((name = uenum_next(e.getAlias(), NULL, status))) {
295            if (strcmp(activeName, name) == 0) {
296                found = TRUE;
297                break;
298            }
299        }
300
301        if (!found) {
302            errln(UnicodeString(activeName) + " is not included in the all charset list.");
303        }
304
305        // some charsets are disabled by default
306        found = FALSE;
307        for (int32_t i = 0; defDisabled[i] != 0; i++) {
308            if (strcmp(activeName, defDisabled[i]) == 0) {
309                found = TRUE;
310                break;
311            }
312        }
313        if (found) {
314            errln(UnicodeString(activeName) + " should not be included in the default charset list.");
315        }
316    }
317}
318
319void CharsetDetectionTest::UTF8Test()
320{
321    UErrorCode status = U_ZERO_ERROR;
322    UnicodeString ss = "This is a string with some non-ascii characters that will "
323                       "be converted to UTF-8, then shoved through the detection process.  "
324                       "\\u0391\\u0392\\u0393\\u0394\\u0395"
325                       "Sure would be nice if our source could contain Unicode directly!";
326    UnicodeString s = ss.unescape();
327    int32_t byteLength = 0, sLength = s.length();
328    char *bytes = extractBytes(s, "UTF-8", byteLength);
329    UCharsetDetector *csd = ucsdet_open(&status);
330    const UCharsetMatch *match;
331    UChar *detected = NEW_ARRAY(UChar, sLength);
332
333    ucsdet_setText(csd, bytes, byteLength, &status);
334    match = ucsdet_detect(csd, &status);
335
336    if (match == NULL) {
337        errln("Detection failure for UTF-8: got no matches.");
338        goto bail;
339    }
340
341    ucsdet_getUChars(match, detected, sLength, &status);
342
343    if (s.compare(detected, sLength) != 0) {
344        errln("Round-trip test failed!");
345    }
346
347    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348
349bail:
350    DELETE_ARRAY(detected);
351    freeBytes(bytes);
352    ucsdet_close(csd);
353}
354
355void CharsetDetectionTest::UTF16Test()
356{
357    UErrorCode status = U_ZERO_ERROR;
358    /* Notice the BOM on the start of this string */
359    UChar chars[] = {
360        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364        0x064a, 0x062a, 0x0000};
365    UnicodeString s(chars);
366    int32_t beLength = 0, leLength = 0;
367    char *beBytes = extractBytes(s, "UTF-16BE", beLength);
368    char *leBytes = extractBytes(s, "UTF-16LE", leLength);
369    UCharsetDetector *csd = ucsdet_open(&status);
370    const UCharsetMatch *match;
371    const char *name;
372    int32_t conf;
373
374    ucsdet_setText(csd, beBytes, beLength, &status);
375    match = ucsdet_detect(csd, &status);
376
377    if (match == NULL) {
378        errln("Encoding detection failure for UTF-16BE: got no matches.");
379        goto try_le;
380    }
381
382    name  = ucsdet_getName(match, &status);
383    conf  = ucsdet_getConfidence(match, &status);
384
385    if (strcmp(name, "UTF-16BE") != 0) {
386        errln("Encoding detection failure for UTF-16BE: got %s", name);
387        goto try_le; // no point in looking at confidence if we got the wrong character set.
388    }
389
390    if (conf != 100) {
391        errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
392    }
393
394try_le:
395    ucsdet_setText(csd, leBytes, leLength, &status);
396    match = ucsdet_detect(csd, &status);
397
398    if (match == NULL) {
399        errln("Encoding detection failure for UTF-16LE: got no matches.");
400        goto bail;
401    }
402
403    name  = ucsdet_getName(match, &status);
404    conf = ucsdet_getConfidence(match, &status);
405
406
407    if (strcmp(name, "UTF-16LE") != 0) {
408        errln("Enconding detection failure for UTF-16LE: got %s", name);
409        goto bail; // no point in looking at confidence if we got the wrong character set.
410    }
411
412    if (conf != 100) {
413        errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
414    }
415
416bail:
417    freeBytes(leBytes);
418    freeBytes(beBytes);
419    ucsdet_close(csd);
420}
421
422void CharsetDetectionTest::InputFilterTest()
423{
424    UErrorCode status = U_ZERO_ERROR;
425    UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
426    UnicodeString s  = ss.unescape();
427    int32_t byteLength = 0;
428    char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
429    UCharsetDetector *csd = ucsdet_open(&status);
430    const UCharsetMatch *match;
431    const char *lang, *name;
432
433    ucsdet_enableInputFilter(csd, TRUE);
434
435    if (!ucsdet_isInputFilterEnabled(csd)) {
436        errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
437    }
438
439
440    ucsdet_setText(csd, bytes, byteLength, &status);
441    match = ucsdet_detect(csd, &status);
442
443    if (match == NULL) {
444        errln("Turning on the input filter resulted in no matches.");
445        goto turn_off;
446    }
447
448    name = ucsdet_getName(match, &status);
449
450    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
451        errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
452    } else {
453        lang = ucsdet_getLanguage(match, &status);
454
455        if (lang == NULL || strcmp(lang, "fr") != 0) {
456            errln("Input filter did not strip markup!");
457        }
458    }
459
460turn_off:
461    ucsdet_enableInputFilter(csd, FALSE);
462    ucsdet_setText(csd, bytes, byteLength, &status);
463    match = ucsdet_detect(csd, &status);
464
465    if (match == NULL) {
466        errln("Turning off the input filter resulted in no matches.");
467        goto bail;
468    }
469
470    name = ucsdet_getName(match, &status);
471
472    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
473        errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
474    } else {
475        lang = ucsdet_getLanguage(match, &status);
476
477        if (lang == NULL || strcmp(lang, "en") != 0) {
478            errln("Unfiltered input did not detect as English!");
479        }
480    }
481
482bail:
483    freeBytes(bytes);
484    ucsdet_close(csd);
485}
486
487void CharsetDetectionTest::C1BytesTest()
488{
489#if !UCONFIG_NO_LEGACY_CONVERSION
490    UErrorCode status = U_ZERO_ERROR;
491    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
492    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
493    UnicodeString sWindows  = ssWindows.unescape();
494    int32_t lISO = 0, lWindows = 0;
495    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
496    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
497    UCharsetDetector *csd = ucsdet_open(&status);
498    const UCharsetMatch *match;
499    const char *name;
500
501    ucsdet_setText(csd, bWindows, lWindows, &status);
502    match = ucsdet_detect(csd, &status);
503
504    if (match == NULL) {
505        errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
506        goto bail;
507    }
508
509    name  = ucsdet_getName(match, &status);
510
511    if (strcmp(name, "windows-1252") != 0) {
512        errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
513    }
514
515    ucsdet_setText(csd, bISO, lISO, &status);
516    match = ucsdet_detect(csd, &status);
517
518    if (match == NULL) {
519        errln("English text without C1 bytes got no matches.");
520        goto bail;
521    }
522
523    name  = ucsdet_getName(match, &status);
524
525    if (strcmp(name, "ISO-8859-1") != 0) {
526        errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
527    }
528
529bail:
530    freeBytes(bWindows);
531    freeBytes(bISO);
532
533    ucsdet_close(csd);
534#endif
535}
536
537void CharsetDetectionTest::DetectionTest()
538{
539#if !UCONFIG_NO_REGULAR_EXPRESSIONS
540    UErrorCode status = U_ZERO_ERROR;
541    char path[2048];
542    const char *testFilePath = getPath(path, "csdetest.xml");
543
544    if (testFilePath == NULL) {
545        return; /* Couldn't get path: error message already output. */
546    }
547
548    UXMLParser  *parser = UXMLParser::createParser(status);
549    if (U_FAILURE(status)) {
550        dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
551        return;
552    }
553
554    UXMLElement *root   = parser->parseFile(testFilePath, status);
555    if (!assertSuccess( "parseFile",status)) return;
556
557    UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
558    UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
559    UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
560
561    const UXMLElement *testCase;
562    int32_t tc = 0;
563
564    while((testCase = root->nextChildElement(tc)) != NULL) {
565        if (testCase->getTagName().compare(test_case) == 0) {
566            const UnicodeString *id = testCase->getAttribute(id_attr);
567            const UnicodeString *encodings = testCase->getAttribute(enc_attr);
568            const UnicodeString  text = testCase->getText(TRUE);
569            int32_t encodingCount;
570            UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
571
572            for(int32_t e = 0; e < encodingCount; e += 1) {
573                checkEncoding(text, encodingList[e], *id);
574            }
575
576            delete[] encodingList;
577        }
578    }
579
580    delete root;
581    delete parser;
582#endif
583}
584
585void CharsetDetectionTest::IBM424Test()
586{
587#if !UCONFIG_ONLY_HTML_CONVERSION
588    UErrorCode status = U_ZERO_ERROR;
589
590    static const UChar chars[] = {
591            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
592            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
593            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
594            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
595            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
596            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
597            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
598            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
599            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
600            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
601            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
602            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
603            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
604            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
605            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
606            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
607            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
608    };
609
610    static const UChar chars_reverse[] = {
611            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
612            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
613            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
614            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
615            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
616            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
617            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
618            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
619            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
620            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
621            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
622            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
623            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
624            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
625            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
626            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
627            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
628            0x0000
629    };
630
631    int32_t bLength = 0, brLength = 0;
632
633    UnicodeString s1(chars);
634    UnicodeString s2(chars_reverse);
635
636    char *bytes = extractBytes(s1, "IBM424", bLength);
637    char *bytes_r = extractBytes(s2, "IBM424", brLength);
638
639    UCharsetDetector *csd = ucsdet_open(&status);
640	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
641	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
642	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
643	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
644    if (U_FAILURE(status)) {
645        errln("Error opening charset detector. - %s", u_errorName(status));
646    }
647    const UCharsetMatch *match;
648    const char *name;
649
650    ucsdet_setText(csd, bytes, bLength, &status);
651    match = ucsdet_detect(csd, &status);
652
653    if (match == NULL) {
654        errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
655        goto bail;
656    }
657
658    name  = ucsdet_getName(match, &status);
659    if (strcmp(name, "IBM424_rtl") != 0) {
660        errln("Encoding detection failure for IBM424_rtl: got %s", name);
661    }
662
663    ucsdet_setText(csd, bytes_r, brLength, &status);
664    match = ucsdet_detect(csd, &status);
665
666    if (match == NULL) {
667        errln("Encoding detection failure for IBM424_ltr: got no matches.");
668        goto bail;
669    }
670
671    name  = ucsdet_getName(match, &status);
672    if (strcmp(name, "IBM424_ltr") != 0) {
673        errln("Encoding detection failure for IBM424_ltr: got %s", name);
674    }
675
676bail:
677    freeBytes(bytes);
678    freeBytes(bytes_r);
679    ucsdet_close(csd);
680#endif
681}
682
683void CharsetDetectionTest::IBM420Test()
684{
685#if !UCONFIG_ONLY_HTML_CONVERSION
686    UErrorCode status = U_ZERO_ERROR;
687
688    static const UChar chars[] = {
689        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
690        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
691        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
692        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
693        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
694        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
695        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
696        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
697        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
698        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
699        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
700        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
701        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
702        0x0000
703    };
704    static const UChar chars_reverse[] = {
705        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
706        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
707        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
708        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
709        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
710        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
711        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
712        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
713        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
714        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
715        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
716        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
717        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
718        0x0000,
719    };
720
721    int32_t bLength = 0, brLength = 0;
722
723    UnicodeString s1(chars);
724    UnicodeString s2(chars_reverse);
725
726    char *bytes = extractBytes(s1, "IBM420", bLength);
727    char *bytes_r = extractBytes(s2, "IBM420", brLength);
728
729    UCharsetDetector *csd = ucsdet_open(&status);
730    if (U_FAILURE(status)) {
731        errln("Error opening charset detector. - %s", u_errorName(status));
732    }
733	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
734	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
735	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
736	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
737    const UCharsetMatch *match;
738    const char *name;
739
740    ucsdet_setText(csd, bytes, bLength, &status);
741    match = ucsdet_detect(csd, &status);
742
743    if (match == NULL) {
744        errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
745        goto bail;
746    }
747
748    name  = ucsdet_getName(match, &status);
749    if (strcmp(name, "IBM420_rtl") != 0) {
750        errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
751    }
752
753    ucsdet_setText(csd, bytes_r, brLength, &status);
754    match = ucsdet_detect(csd, &status);
755
756    if (match == NULL) {
757        errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
758        goto bail;
759    }
760
761    name  = ucsdet_getName(match, &status);
762    if (strcmp(name, "IBM420_ltr") != 0) {
763        errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
764    }
765
766bail:
767    freeBytes(bytes);
768    freeBytes(bytes_r);
769    ucsdet_close(csd);
770#endif
771}
772
773
774void CharsetDetectionTest::Ticket6394Test() {
775#if !UCONFIG_NO_CONVERSION
776    const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
777                             "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
778                             "encodings more than once.  The hop through UnicodeString is for platforms "
779                             "where this char * string is be EBCDIC and needs conversion to Latin1.";
780    char latin1Text[sizeof(charText)];
781    UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
782
783    UErrorCode status = U_ZERO_ERROR;
784    UCharsetDetector *csd = ucsdet_open(&status);
785    ucsdet_setText(csd, latin1Text, -1, &status);
786    if (U_FAILURE(status)) {
787        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
788        return;
789    }
790
791    int32_t matchCount = 0;
792    const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
793    if (U_FAILURE(status)) {
794        errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
795        return;
796    }
797
798    UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
799    int32_t i;
800    for (i=0; i<matchCount; i++) {
801        UnicodeString charSetName(ucsdet_getName(matches[i], &status));
802        if (U_FAILURE(status)) {
803            errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
804            status = U_ZERO_ERROR;
805        }
806        if (setOfCharsetNames.contains(charSetName)) {
807            errln("Fail at file %s, line %d ", __FILE__, __LINE__);
808            errln(UnicodeString("   Duplicate charset name = ") + charSetName);
809        }
810        setOfCharsetNames.add(charSetName);
811    }
812    ucsdet_close(csd);
813#endif
814}
815
816
817// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
818//               similar Windows and non-Windows SBCS encodings. State was kept in the shared
819//               Charset Recognizer objects, and could be overwritten.
820void CharsetDetectionTest::Ticket6954Test() {
821#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
822    UErrorCode status = U_ZERO_ERROR;
823    UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
824    UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
825                            "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
826    UnicodeString sWindows  = ssWindows.unescape();
827    int32_t lISO = 0, lWindows = 0;
828    char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
829    char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
830
831    // First do a plain vanilla detect of 1252 text
832
833    UCharsetDetector *csd1 = ucsdet_open(&status);
834    ucsdet_setText(csd1, bWindows, lWindows, &status);
835    const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
836    const char *name1 = ucsdet_getName(match1, &status);
837    TEST_ASSERT_SUCCESS(status);
838    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
839
840    // Next, using a completely separate detector, detect some 8859-1 text
841
842    UCharsetDetector *csd2 = ucsdet_open(&status);
843    ucsdet_setText(csd2, bISO, lISO, &status);
844    const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
845    const char *name2 = ucsdet_getName(match2, &status);
846    TEST_ASSERT_SUCCESS(status);
847    TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
848
849    // Recheck the 1252 results from the first detector, which should not have been
850    //  altered by the use of a different detector.
851
852    name1 = ucsdet_getName(match1, &status);
853    TEST_ASSERT_SUCCESS(status);
854    TEST_ASSERT(strcmp(name1, "windows-1252")==0);
855
856    ucsdet_close(csd1);
857    ucsdet_close(csd2);
858    freeBytes(bISO);
859    freeBytes(bWindows);
860#endif
861}
862