1/*
2 ****************************************************************************
3 * Copyright (c) 2005-2009, International Business Machines Corporation and *
4 * others. All Rights Reserved.                                             *
5 ****************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/ustring.h"
13
14#include "cintltst.h"
15
16#include <stdlib.h>
17#include <string.h>
18
/* Element count of a true array object (must not be used on a pointer). */
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))

/*
 * NEW_ARRAY allocates uninitialized storage for `count` objects of `type`
 * with malloc() and yields a `type *` (NULL if malloc fails). The whole
 * expansion is parenthesized so that it can safely be combined with postfix
 * operators. DELETE_ARRAY releases such storage with free().
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) free(array)
23
24static void TestConstruction(void);
25static void TestUTF8(void);
26static void TestUTF16(void);
27static void TestC1Bytes(void);
28static void TestInputFilter(void);
29static void TestChaining(void);
30static void TestBufferOverflow(void);
31static void TestIBM424(void);
32static void TestIBM420(void);
33
34void addUCsdetTest(TestNode** root);
35
36void addUCsdetTest(TestNode** root)
37{
38    addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
39    addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
40    addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
41    addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
42    addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
43    addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
44    addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
45#if !UCONFIG_NO_LEGACY_CONVERSION
46    addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
47    addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
48#endif
49}
50
51static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
52{
53    UErrorCode status;
54    char buffer[1024];
55    char *dest, *destLimit = buffer + sizeof(buffer);
56    const UChar *srcLimit = src + length;
57    int32_t result = 0;
58
59    do {
60        dest = buffer;
61        status = U_ZERO_ERROR;
62        ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
63        result += (int32_t) (dest - buffer);
64    } while (status == U_BUFFER_OVERFLOW_ERROR);
65
66    return result;
67}
68
69static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
70{
71    UErrorCode status = U_ZERO_ERROR;
72    UConverter *cnv = ucnv_open(codepage, &status);
73    int32_t byteCount = preflight(src, length, cnv);
74    const UChar *srcLimit = src + length;
75    char *bytes = NEW_ARRAY(char, byteCount + 1);
76    char *dest = bytes, *destLimit = bytes + byteCount + 1;
77
78    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
79    ucnv_close(cnv);
80
81    *byteLength = byteCount;
82    return bytes;
83}
84
/*
 * Releases a buffer obtained from extractBytes().
 * The buffer comes from malloc() (via NEW_ARRAY), so it is returned with
 * free(), which is what DELETE_ARRAY expands to.
 */
static void freeBytes(char *bytes)
{
    free(bytes);
}
89
90static void TestConstruction(void)
91{
92    UErrorCode status = U_ZERO_ERROR;
93    UCharsetDetector *csd = ucsdet_open(&status);
94    UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
95    const char *name;
96    int32_t count = uenum_count(e, &status);
97    int32_t i, length;
98
99    for(i = 0; i < count; i += 1) {
100        name = uenum_next(e, &length, &status);
101
102        if(name == NULL || length <= 0) {
103            log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
104        }
105    }
106    /* one past the list of all names must return NULL */
107    name = uenum_next(e, &length, &status);
108    if(name != NULL || length != 0 || U_FAILURE(status)) {
109        log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
110    }
111
112    uenum_close(e);
113    ucsdet_close(csd);
114}
115
116static void TestUTF8(void)
117{
118    UErrorCode status = U_ZERO_ERROR;
119    static const char ss[] = "This is a string with some non-ascii characters that will "
120               "be converted to UTF-8, then shoved through the detection process.  "
121               "\\u0391\\u0392\\u0393\\u0394\\u0395"
122               "Sure would be nice if our source could contain Unicode directly!";
123    int32_t byteLength = 0, sLength = 0, dLength = 0;
124    UChar s[sizeof(ss)];
125    char *bytes;
126    UCharsetDetector *csd = ucsdet_open(&status);
127    const UCharsetMatch *match;
128    UChar detected[sizeof(ss)];
129
130    sLength = u_unescape(ss, s, sizeof(ss));
131    bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
132
133    ucsdet_setText(csd, bytes, byteLength, &status);
134    if (U_FAILURE(status)) {
135        log_err("status is %s\n", u_errorName(status));
136        goto bail;
137    }
138
139    match = ucsdet_detect(csd, &status);
140
141    if (match == NULL) {
142        log_err("Detection failure for UTF-8: got no matches.\n");
143        goto bail;
144    }
145
146    dLength = ucsdet_getUChars(match, detected, sLength, &status);
147
148    if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
149        log_err("Round-trip test failed!\n");
150    }
151
152    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
153
154bail:
155    freeBytes(bytes);
156    ucsdet_close(csd);
157}
158
159static void TestUTF16(void)
160{
161    UErrorCode status = U_ZERO_ERROR;
162    /* Notice the BOM on the start of this string */
163    static const UChar chars[] = {
164        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
165        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
166        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
167        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
168        0x064a, 0x062a, 0x0000};
169    int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
170    char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
171    char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
172    UCharsetDetector *csd = ucsdet_open(&status);
173    const UCharsetMatch *match;
174    const char *name;
175    int32_t conf;
176
177    ucsdet_setText(csd, beBytes, beLength, &status);
178    match = ucsdet_detect(csd, &status);
179
180    if (match == NULL) {
181        log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
182        goto try_le;
183    }
184
185    name  = ucsdet_getName(match, &status);
186    conf  = ucsdet_getConfidence(match, &status);
187
188    if (strcmp(name, "UTF-16BE") != 0) {
189        log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
190    }
191
192    if (conf != 100) {
193        log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
194    }
195
196try_le:
197    ucsdet_setText(csd, leBytes, leLength, &status);
198    match = ucsdet_detect(csd, &status);
199
200    if (match == NULL) {
201        log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
202        goto bail;
203    }
204
205    name  = ucsdet_getName(match, &status);
206    conf = ucsdet_getConfidence(match, &status);
207
208
209    if (strcmp(name, "UTF-16LE") != 0) {
210        log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
211    }
212
213    if (conf != 100) {
214        log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
215    }
216
217bail:
218    freeBytes(leBytes);
219    freeBytes(beBytes);
220    ucsdet_close(csd);
221}
222
223static void TestC1Bytes(void)
224{
225#if !UCONFIG_NO_LEGACY_CONVERSION
226    UErrorCode status = U_ZERO_ERROR;
227    static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
228    static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
229    int32_t sISOLength = 0, sWindowsLength = 0;
230    UChar sISO[sizeof(ssISO)];
231    UChar sWindows[sizeof(ssWindows)];
232    int32_t lISO = 0, lWindows = 0;
233    char *bISO;
234    char *bWindows;
235    UCharsetDetector *csd = ucsdet_open(&status);
236    const UCharsetMatch *match;
237    const char *name;
238
239    sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
240    sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
241    bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
242    bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
243
244    ucsdet_setText(csd, bWindows, lWindows, &status);
245    match = ucsdet_detect(csd, &status);
246
247    if (match == NULL) {
248        log_err("English test with C1 bytes got no matches.\n");
249        goto bail;
250    }
251
252    name  = ucsdet_getName(match, &status);
253
254    if (strcmp(name, "windows-1252") != 0) {
255        log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
256    }
257
258    ucsdet_setText(csd, bISO, lISO, &status);
259    match = ucsdet_detect(csd, &status);
260
261    if (match == NULL) {
262        log_err("English text without C1 bytes got no matches.\n");
263        goto bail;
264    }
265
266    name  = ucsdet_getName(match, &status);
267
268    if (strcmp(name, "ISO-8859-1") != 0) {
269        log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
270    }
271
272bail:
273    freeBytes(bWindows);
274    freeBytes(bISO);
275
276    ucsdet_close(csd);
277#endif
278}
279
280static void TestInputFilter(void)
281{
282    UErrorCode status = U_ZERO_ERROR;
283    static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
284    int32_t sLength = 0;
285    UChar s[sizeof(ss)];
286    int32_t byteLength = 0;
287    char *bytes;
288    UCharsetDetector *csd = ucsdet_open(&status);
289    const UCharsetMatch *match;
290    const char *lang, *name;
291
292    sLength = u_unescape(ss, s, sizeof(ss));
293    bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
294
295    ucsdet_enableInputFilter(csd, TRUE);
296
297    if (!ucsdet_isInputFilterEnabled(csd)) {
298        log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
299    }
300
301
302    ucsdet_setText(csd, bytes, byteLength, &status);
303    match = ucsdet_detect(csd, &status);
304
305    if (match == NULL) {
306        log_err("Turning on the input filter resulted in no matches.\n");
307        goto turn_off;
308    }
309
310    name = ucsdet_getName(match, &status);
311
312    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
313        log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
314    } else {
315        lang = ucsdet_getLanguage(match, &status);
316
317        if (lang == NULL || strcmp(lang, "fr") != 0) {
318            log_err("Input filter did not strip markup!\n");
319        }
320    }
321
322turn_off:
323    ucsdet_enableInputFilter(csd, FALSE);
324    ucsdet_setText(csd, bytes, byteLength, &status);
325    match = ucsdet_detect(csd, &status);
326
327    if (match == NULL) {
328        log_err("Turning off the input filter resulted in no matches.\n");
329        goto bail;
330    }
331
332    name = ucsdet_getName(match, &status);
333
334    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
335        log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
336    } else {
337        lang = ucsdet_getLanguage(match, &status);
338
339        if (lang == NULL || strcmp(lang, "en") != 0) {
340            log_err("Unfiltered input did not detect as English!\n");
341        }
342    }
343
344bail:
345    freeBytes(bytes);
346    ucsdet_close(csd);
347}
348
349static void TestChaining(void) {
350    UErrorCode status = U_USELESS_COLLATOR_ERROR;
351
352    ucsdet_open(&status);
353    ucsdet_setText(NULL, NULL, 0, &status);
354    ucsdet_getName(NULL, &status);
355    ucsdet_getConfidence(NULL, &status);
356    ucsdet_getLanguage(NULL, &status);
357    ucsdet_detect(NULL, &status);
358    ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
359    ucsdet_detectAll(NULL, NULL, &status);
360    ucsdet_getUChars(NULL, NULL, 0, &status);
361    ucsdet_getUChars(NULL, NULL, 0, &status);
362    ucsdet_close(NULL);
363
364    /* All of this code should have done nothing. */
365    if (status != U_USELESS_COLLATOR_ERROR) {
366        log_err("Status got changed to %s\n", u_errorName(status));
367    }
368}
369
370static void TestBufferOverflow(void) {
371    UErrorCode status = U_ZERO_ERROR;
372    static const char *testStrings[] = {
373        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
374        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
375        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
376        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
377        "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
378        "\xa1", /* Could be a single byte shift-jis at the end */
379        "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
380        "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
381    };
382    static const char *testResults[] = {
383        "windows-1252",
384        "windows-1252",
385        "windows-1252",
386        "windows-1252",
387        "ISO-2022-JP",
388        NULL,
389        NULL,
390        "ISO-8859-1"
391    };
392    int32_t idx = 0;
393    UCharsetDetector *csd = ucsdet_open(&status);
394    const UCharsetMatch *match;
395
396    ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
397
398    if (U_FAILURE(status)) {
399        log_err("Couldn't open detector. %s\n", u_errorName(status));
400        goto bail;
401    }
402
403    for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) {
404        ucsdet_setText(csd, testStrings[idx], -1, &status);
405        match = ucsdet_detect(csd, &status);
406
407        if (match == NULL) {
408            if (testResults[idx] != NULL) {
409                log_err("Unexpectedly got no results at index %d.\n", idx);
410            }
411            else {
412                log_verbose("Got no result as expected at index %d.\n", idx);
413            }
414            continue;
415        }
416
417        if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
418            log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
419                ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
420            goto bail;
421        }
422    }
423
424bail:
425    ucsdet_close(csd);
426}
427
428static void TestIBM424(void)
429{
430    UErrorCode status = U_ZERO_ERROR;
431
432    static const UChar chars[] = {
433            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
434            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
435            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
436            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
437            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
438            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
439            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
440            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
441            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
442            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
443            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
444            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
445            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
446            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
447            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
448            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
449            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
450    };
451
452    static const UChar chars_reverse[] = {
453            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
454            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
455            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
456            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
457            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
458            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
459            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
460            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
461            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
462            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
463            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
464            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
465            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
466            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
467            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
468            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
469            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
470            0x0000
471    };
472
473    int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
474
475    char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
476    char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
477
478    UCharsetDetector *csd = ucsdet_open(&status);
479    const UCharsetMatch *match;
480    const char *name;
481
482    ucsdet_setText(csd, bytes, bLength, &status);
483    match = ucsdet_detect(csd, &status);
484
485    if (match == NULL) {
486        log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
487        goto bail;
488    }
489
490    name  = ucsdet_getName(match, &status);
491    if (strcmp(name, "IBM424_rtl") != 0) {
492        log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
493    }
494
495    ucsdet_setText(csd, bytes_r, brLength, &status);
496    match = ucsdet_detect(csd, &status);
497
498    if (match == NULL) {
499        log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
500        goto bail;
501    }
502
503    name  = ucsdet_getName(match, &status);
504    if (strcmp(name, "IBM424_ltr") != 0) {
505        log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
506    }
507
508bail:
509    freeBytes(bytes);
510    freeBytes(bytes_r);
511    ucsdet_close(csd);
512}
513
514static void TestIBM420(void)
515{
516    UErrorCode status = U_ZERO_ERROR;
517
518    static const UChar chars[] = {
519        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
520        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
521        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
522        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
523        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
524        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
525        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
526        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
527        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
528        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
529        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
530        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
531        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
532        0x0000
533    };
534    static const UChar chars_reverse[] = {
535        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
536        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
537        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
538        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
539        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
540        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
541        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
542        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
543        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
544        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
545        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
546        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
547        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
548        0x0000,
549    };
550
551    int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
552
553    char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
554    char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
555
556    UCharsetDetector *csd = ucsdet_open(&status);
557    const UCharsetMatch *match;
558    const char *name;
559
560    ucsdet_setText(csd, bytes, bLength, &status);
561    match = ucsdet_detect(csd, &status);
562
563    if (match == NULL) {
564        log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
565        goto bail;
566    }
567
568    name  = ucsdet_getName(match, &status);
569    if (strcmp(name, "IBM420_rtl") != 0) {
570        log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
571    }
572
573    ucsdet_setText(csd, bytes_r, brLength, &status);
574    match = ucsdet_detect(csd, &status);
575
576    if (match == NULL) {
577        log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
578        goto bail;
579    }
580
581    name  = ucsdet_getName(match, &status);
582    if (strcmp(name, "IBM420_ltr") != 0) {
583        log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
584    }
585
586bail:
587    freeBytes(bytes);
588    freeBytes(bytes_r);
589    ucsdet_close(csd);
590}
591