1/*
2 ****************************************************************************
3 * Copyright (c) 2005-2009, International Business Machines Corporation and *
4 * others. All Rights Reserved.                                             *
5 ****************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#include "unicode/ucsdet.h"
11#include "unicode/ucnv.h"
12#include "unicode/ustring.h"
13
14#include "cintltst.h"
15
16#include <stdlib.h>
17#include <string.h>
18
/*
 * Number of elements in a true array (not a pointer).  The result is cast to
 * int32_t so it can be compared with the signed indices and lengths used
 * throughout this file without an implicit signed/unsigned conversion.
 */
#define ARRAY_SIZE(array) ((int32_t) (sizeof(array) / sizeof((array)[0])))

/* Allocate / release an uninitialized array of count elements of type. */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) free(array)
23
/* Forward declarations: detector API behaviour. */
static void TestConstruction(void);
static void TestInputFilter(void);
static void TestChaining(void);
static void TestBufferOverflow(void);

/* Forward declarations: detection of specific encodings. */
static void TestUTF8(void);
static void TestUTF16(void);
static void TestC1Bytes(void);
static void TestIBM424(void);
static void TestIBM420(void);
33
34void addUCsdetTest(TestNode** root);
35
36void addUCsdetTest(TestNode** root)
37{
38    addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
39    addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
40    addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
41    addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
42    addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
43    addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
44    addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
45    addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
46    addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
47}
48
49static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
50{
51    UErrorCode status;
52    char buffer[1024];
53    char *dest, *destLimit = buffer + sizeof(buffer);
54    const UChar *srcLimit = src + length;
55    int32_t result = 0;
56
57    do {
58        dest = buffer;
59        status = U_ZERO_ERROR;
60        ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
61        result += (int32_t) (dest - buffer);
62    } while (status == U_BUFFER_OVERFLOW_ERROR);
63
64    return result;
65}
66
67static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
68{
69    UErrorCode status = U_ZERO_ERROR;
70    UConverter *cnv = ucnv_open(codepage, &status);
71    int32_t byteCount = preflight(src, length, cnv);
72    const UChar *srcLimit = src + length;
73    char *bytes = NEW_ARRAY(char, byteCount + 1);
74    char *dest = bytes, *destLimit = bytes + byteCount + 1;
75
76    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
77    ucnv_close(cnv);
78
79    *byteLength = byteCount;
80    return bytes;
81}
82
/*
 * Releases a byte buffer returned by extractBytes().  The buffer is handed to
 * DELETE_ARRAY, the counterpart of the NEW_ARRAY used to allocate it.
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
87
88static void TestConstruction(void)
89{
90    UErrorCode status = U_ZERO_ERROR;
91    UCharsetDetector *csd = ucsdet_open(&status);
92    UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
93    const char *name;
94    int32_t count = uenum_count(e, &status);
95    int32_t i, length;
96
97    for(i = 0; i < count; i += 1) {
98        name = uenum_next(e, &length, &status);
99
100        if(name == NULL || length <= 0) {
101            log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
102        }
103    }
104    /* one past the list of all names must return NULL */
105    name = uenum_next(e, &length, &status);
106    if(name != NULL || length != 0 || U_FAILURE(status)) {
107        log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
108    }
109
110    uenum_close(e);
111    ucsdet_close(csd);
112}
113
114static void TestUTF8(void)
115{
116    UErrorCode status = U_ZERO_ERROR;
117    static const char ss[] = "This is a string with some non-ascii characters that will "
118               "be converted to UTF-8, then shoved through the detection process.  "
119               "\\u0391\\u0392\\u0393\\u0394\\u0395"
120               "Sure would be nice if our source could contain Unicode directly!";
121    int32_t byteLength = 0, sLength = 0, dLength = 0;
122    UChar s[sizeof(ss)];
123    char *bytes;
124    UCharsetDetector *csd = ucsdet_open(&status);
125    const UCharsetMatch *match;
126    UChar detected[sizeof(ss)];
127
128    sLength = u_unescape(ss, s, sizeof(ss));
129    bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
130
131    ucsdet_setText(csd, bytes, byteLength, &status);
132    if (U_FAILURE(status)) {
133        log_err("status is %s\n", u_errorName(status));
134        goto bail;
135    }
136
137    match = ucsdet_detect(csd, &status);
138
139    if (match == NULL) {
140        log_err("Detection failure for UTF-8: got no matches.\n");
141        goto bail;
142    }
143
144    dLength = ucsdet_getUChars(match, detected, sLength, &status);
145
146    if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
147        log_err("Round-trip test failed!\n");
148    }
149
150    ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
151
152bail:
153    freeBytes(bytes);
154    ucsdet_close(csd);
155}
156
157static void TestUTF16(void)
158{
159    UErrorCode status = U_ZERO_ERROR;
160    /* Notice the BOM on the start of this string */
161    static const UChar chars[] = {
162        0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
163        0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
164        0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
165        0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
166        0x064a, 0x062a, 0x0000};
167    int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
168    char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
169    char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
170    UCharsetDetector *csd = ucsdet_open(&status);
171    const UCharsetMatch *match;
172    const char *name;
173    int32_t conf;
174
175    ucsdet_setText(csd, beBytes, beLength, &status);
176    match = ucsdet_detect(csd, &status);
177
178    if (match == NULL) {
179        log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
180        goto try_le;
181    }
182
183    name  = ucsdet_getName(match, &status);
184    conf  = ucsdet_getConfidence(match, &status);
185
186    if (strcmp(name, "UTF-16BE") != 0) {
187        log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
188    }
189
190    if (conf != 100) {
191        log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
192    }
193
194try_le:
195    ucsdet_setText(csd, leBytes, leLength, &status);
196    match = ucsdet_detect(csd, &status);
197
198    if (match == NULL) {
199        log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
200        goto bail;
201    }
202
203    name  = ucsdet_getName(match, &status);
204    conf = ucsdet_getConfidence(match, &status);
205
206
207    if (strcmp(name, "UTF-16LE") != 0) {
208        log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
209    }
210
211    if (conf != 100) {
212        log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
213    }
214
215bail:
216    freeBytes(leBytes);
217    freeBytes(beBytes);
218    ucsdet_close(csd);
219}
220
221static void TestC1Bytes(void)
222{
223#if !UCONFIG_NO_LEGACY_CONVERSION
224    UErrorCode status = U_ZERO_ERROR;
225    static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
226    static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
227    int32_t sISOLength = 0, sWindowsLength = 0;
228    UChar sISO[sizeof(ssISO)];
229    UChar sWindows[sizeof(ssWindows)];
230    int32_t lISO = 0, lWindows = 0;
231    char *bISO;
232    char *bWindows;
233    UCharsetDetector *csd = ucsdet_open(&status);
234    const UCharsetMatch *match;
235    const char *name;
236
237    sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
238    sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
239    bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
240    bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
241
242    ucsdet_setText(csd, bWindows, lWindows, &status);
243    match = ucsdet_detect(csd, &status);
244
245    if (match == NULL) {
246        log_err("English test with C1 bytes got no matches.\n");
247        goto bail;
248    }
249
250    name  = ucsdet_getName(match, &status);
251
252    if (strcmp(name, "windows-1252") != 0) {
253        log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
254    }
255
256    ucsdet_setText(csd, bISO, lISO, &status);
257    match = ucsdet_detect(csd, &status);
258
259    if (match == NULL) {
260        log_err("English text without C1 bytes got no matches.\n");
261        goto bail;
262    }
263
264    name  = ucsdet_getName(match, &status);
265
266    if (strcmp(name, "ISO-8859-1") != 0) {
267        log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
268    }
269
270bail:
271    freeBytes(bWindows);
272    freeBytes(bISO);
273
274    ucsdet_close(csd);
275#endif
276}
277
278static void TestInputFilter(void)
279{
280    UErrorCode status = U_ZERO_ERROR;
281    static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
282    int32_t sLength = 0;
283    UChar s[sizeof(ss)];
284    int32_t byteLength = 0;
285    char *bytes;
286    UCharsetDetector *csd = ucsdet_open(&status);
287    const UCharsetMatch *match;
288    const char *lang, *name;
289
290    sLength = u_unescape(ss, s, sizeof(ss));
291    bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
292
293    ucsdet_enableInputFilter(csd, TRUE);
294
295    if (!ucsdet_isInputFilterEnabled(csd)) {
296        log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
297    }
298
299
300    ucsdet_setText(csd, bytes, byteLength, &status);
301    match = ucsdet_detect(csd, &status);
302
303    if (match == NULL) {
304        log_err("Turning on the input filter resulted in no matches.\n");
305        goto turn_off;
306    }
307
308    name = ucsdet_getName(match, &status);
309
310    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
311        log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
312    } else {
313        lang = ucsdet_getLanguage(match, &status);
314
315        if (lang == NULL || strcmp(lang, "fr") != 0) {
316            log_err("Input filter did not strip markup!\n");
317        }
318    }
319
320turn_off:
321    ucsdet_enableInputFilter(csd, FALSE);
322    ucsdet_setText(csd, bytes, byteLength, &status);
323    match = ucsdet_detect(csd, &status);
324
325    if (match == NULL) {
326        log_err("Turning off the input filter resulted in no matches.\n");
327        goto bail;
328    }
329
330    name = ucsdet_getName(match, &status);
331
332    if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
333        log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
334    } else {
335        lang = ucsdet_getLanguage(match, &status);
336
337        if (lang == NULL || strcmp(lang, "en") != 0) {
338            log_err("Unfiltered input did not detect as English!\n");
339        }
340    }
341
342bail:
343    freeBytes(bytes);
344    ucsdet_close(csd);
345}
346
347static void TestChaining(void) {
348    UErrorCode status = U_USELESS_COLLATOR_ERROR;
349
350    ucsdet_open(&status);
351    ucsdet_setText(NULL, NULL, 0, &status);
352    ucsdet_getName(NULL, &status);
353    ucsdet_getConfidence(NULL, &status);
354    ucsdet_getLanguage(NULL, &status);
355    ucsdet_detect(NULL, &status);
356    ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
357    ucsdet_detectAll(NULL, NULL, &status);
358    ucsdet_getUChars(NULL, NULL, 0, &status);
359    ucsdet_getUChars(NULL, NULL, 0, &status);
360    ucsdet_close(NULL);
361
362    /* All of this code should have done nothing. */
363    if (status != U_USELESS_COLLATOR_ERROR) {
364        log_err("Status got changed to %s\n", u_errorName(status));
365    }
366}
367
368static void TestBufferOverflow(void) {
369    UErrorCode status = U_ZERO_ERROR;
370    static const char *testStrings[] = {
371        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
372        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
373        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
374        "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
375        "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
376        "\xa1", /* Could be a single byte shift-jis at the end */
377        "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
378        "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
379    };
380    static const char *testResults[] = {
381        "windows-1252",
382        "windows-1252",
383        "windows-1252",
384        "windows-1252",
385        "ISO-2022-JP",
386        NULL,
387        NULL,
388        "ISO-8859-1"
389    };
390    int32_t idx = 0;
391    UCharsetDetector *csd = ucsdet_open(&status);
392    const UCharsetMatch *match;
393
394    ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
395
396    if (U_FAILURE(status)) {
397        log_err("Couldn't open detector. %s\n", u_errorName(status));
398        goto bail;
399    }
400
401    for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) {
402        ucsdet_setText(csd, testStrings[idx], -1, &status);
403        match = ucsdet_detect(csd, &status);
404
405        if (match == NULL) {
406            if (testResults[idx] != NULL) {
407                log_err("Unexpectedly got no results at index %d.\n", idx);
408            }
409            else {
410                log_verbose("Got no result as expected at index %d.\n", idx);
411            }
412            continue;
413        }
414
415        if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
416            log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
417                ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
418            goto bail;
419        }
420    }
421
422bail:
423    ucsdet_close(csd);
424}
425
426static void TestIBM424(void)
427{
428    UErrorCode status = U_ZERO_ERROR;
429
430    static const UChar chars[] = {
431            0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
432            0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
433            0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
434            0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
435            0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
436            0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
437            0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
438            0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
439            0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
440            0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
441            0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
442            0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
443            0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
444            0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
445            0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
446            0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
447            0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
448    };
449
450    static const UChar chars_reverse[] = {
451            0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
452            0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
453            0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
454            0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
455            0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
456            0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
457            0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
458            0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
459            0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
460            0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
461            0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
462            0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
463            0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
464            0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
465            0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
466            0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
467            0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
468            0x0000
469    };
470
471    int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
472
473    char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
474    char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
475
476    UCharsetDetector *csd = ucsdet_open(&status);
477    const UCharsetMatch *match;
478    const char *name;
479
480    ucsdet_setText(csd, bytes, bLength, &status);
481    match = ucsdet_detect(csd, &status);
482
483    if (match == NULL) {
484        log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
485        goto bail;
486    }
487
488    name  = ucsdet_getName(match, &status);
489    if (strcmp(name, "IBM424_rtl") != 0) {
490        log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
491    }
492
493    ucsdet_setText(csd, bytes_r, brLength, &status);
494    match = ucsdet_detect(csd, &status);
495
496    if (match == NULL) {
497        log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
498        goto bail;
499    }
500
501    name  = ucsdet_getName(match, &status);
502    if (strcmp(name, "IBM424_ltr") != 0) {
503        log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
504    }
505
506bail:
507    freeBytes(bytes);
508    freeBytes(bytes_r);
509    ucsdet_close(csd);
510}
511
512static void TestIBM420(void)
513{
514    UErrorCode status = U_ZERO_ERROR;
515
516    static const UChar chars[] = {
517        0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
518        0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
519        0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
520        0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
521        0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
522        0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
523        0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
524        0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
525        0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
526        0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
527        0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
528        0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
529        0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
530        0x0000
531    };
532    static const UChar chars_reverse[] = {
533        0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
534        0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
535        0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
536        0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
537        0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
538        0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
539        0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
540        0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
541        0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
542        0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
543        0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
544        0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
545        0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
546        0x0000,
547    };
548
549    int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
550
551    char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
552    char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
553
554    UCharsetDetector *csd = ucsdet_open(&status);
555    const UCharsetMatch *match;
556    const char *name;
557
558    ucsdet_setText(csd, bytes, bLength, &status);
559    match = ucsdet_detect(csd, &status);
560
561    if (match == NULL) {
562        log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
563        goto bail;
564    }
565
566    name  = ucsdet_getName(match, &status);
567    if (strcmp(name, "IBM420_rtl") != 0) {
568        log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
569    }
570
571    ucsdet_setText(csd, bytes_r, brLength, &status);
572    match = ucsdet_detect(csd, &status);
573
574    if (match == NULL) {
575        log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
576        goto bail;
577    }
578
579    name  = ucsdet_getName(match, &status);
580    if (strcmp(name, "IBM420_ltr") != 0) {
581        log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
582    }
583
584bail:
585    freeBytes(bytes);
586    freeBytes(bytes_r);
587    ucsdet_close(csd);
588}
589