1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2014, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  convtest.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003jul15
14*   created by: Markus W. Scherer
15*
16*   Test file for data-driven conversion tests.
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_LEGACY_CONVERSION
/*
 * Note: Turning off all of convtest.cpp if UCONFIG_NO_LEGACY_CONVERSION is set
 * is broader than strictly necessary - it also removes tests for Unicode
 * charsets like UTF-8 that should still work.
 * However, there is no easy way for the test to detect whether a test case
 * is for a Unicode charset, so it would be difficult to exclude only the
 * test cases for legacy charsets.
 * Also, regular testing of ICU is done with all modules on, therefore
 * not testing conversion for a custom configuration like this should be ok.
 */
31
32#include "unicode/ucnv.h"
33#include "unicode/unistr.h"
34#include "unicode/parsepos.h"
35#include "unicode/uniset.h"
36#include "unicode/ustring.h"
37#include "unicode/ures.h"
38#include "convtest.h"
39#include "cmemory.h"
40#include "unicode/tstdtmod.h"
41#include <string.h>
42#include <stdlib.h>
43
// Characters used in the "callback" field of the test data.
// The first character of that field selects the converter callback function.
enum {
    SUB_CB='?',     // substitute callback
    SKIP_CB='0',    // skip callback
    STOP_CB='.',    // stop callback
    ESC_CB='&'      // escape callback
};
51
52ConversionTest::ConversionTest() {
53    UErrorCode errorCode=U_ZERO_ERROR;
54    utf8Cnv=ucnv_open("UTF-8", &errorCode);
55    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
56    if(U_FAILURE(errorCode)) {
57        errln("unable to open UTF-8 converter");
58    }
59}
60
61ConversionTest::~ConversionTest() {
62    ucnv_close(utf8Cnv);
63}
64
65void
66ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
67    if (exec) logln("TestSuite ConversionTest: ");
68    switch (index) {
69#if !UCONFIG_NO_FILE_IO
70        case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
71        case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
72        case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
73        case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break;
74#else
75        case 0:
76        case 1:
77        case 2:
78        case 3: name="skip"; break;
79#endif
80        case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
81        default: name=""; break; //needed to end loop
82    }
83}
84
85// test data interface ----------------------------------------------------- ***
86
87void
88ConversionTest::TestToUnicode() {
89    ConversionCase cc;
90    char charset[100], cbopt[4];
91    const char *option;
92    UnicodeString s, unicode;
93    int32_t offsetsLength;
94    UConverterToUCallback callback;
95
96    TestDataModule *dataModule;
97    TestData *testData;
98    const DataMap *testCase;
99    UErrorCode errorCode;
100    int32_t i;
101
102    errorCode=U_ZERO_ERROR;
103    dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
104    if(U_SUCCESS(errorCode)) {
105        testData=dataModule->createTestData("toUnicode", errorCode);
106        if(U_SUCCESS(errorCode)) {
107            for(i=0; testData->nextCase(testCase, errorCode); ++i) {
108                if(U_FAILURE(errorCode)) {
109                    errln("error retrieving conversion/toUnicode test case %d - %s",
110                            i, u_errorName(errorCode));
111                    errorCode=U_ZERO_ERROR;
112                    continue;
113                }
114
115                cc.caseNr=i;
116
117                s=testCase->getString("charset", errorCode);
118                s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
119                cc.charset=charset;
120
121                // BEGIN android-added
122                // To save space, Android does not build full ISO-2022-CN tables.
123                // We skip the TestGetKeywordValuesForLocale for counting available collations.
124                if (strlen(charset) >= 8 &&
125                    strncmp(charset+4, "2022-CN", 4) == 0) {
126                    continue;
127                }
128                // END android-added
129
130                cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
131                unicode=testCase->getString("unicode", errorCode);
132                cc.unicode=unicode.getBuffer();
133                cc.unicodeLength=unicode.length();
134
135                offsetsLength=0;
136                cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
137                if(offsetsLength==0) {
138                    cc.offsets=NULL;
139                } else if(offsetsLength!=unicode.length()) {
140                    errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
141                            i, unicode.length(), offsetsLength);
142                    errorCode=U_ILLEGAL_ARGUMENT_ERROR;
143                }
144
145                cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
146                cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
147
148                s=testCase->getString("errorCode", errorCode);
149                if(s==UNICODE_STRING("invalid", 7)) {
150                    cc.outErrorCode=U_INVALID_CHAR_FOUND;
151                } else if(s==UNICODE_STRING("illegal", 7)) {
152                    cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
153                } else if(s==UNICODE_STRING("truncated", 9)) {
154                    cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
155                } else if(s==UNICODE_STRING("illesc", 6)) {
156                    cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
157                } else if(s==UNICODE_STRING("unsuppesc", 9)) {
158                    cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
159                } else {
160                    cc.outErrorCode=U_ZERO_ERROR;
161                }
162
163                s=testCase->getString("callback", errorCode);
164                s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
165                cc.cbopt=cbopt;
166                switch(cbopt[0]) {
167                case SUB_CB:
168                    callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
169                    break;
170                case SKIP_CB:
171                    callback=UCNV_TO_U_CALLBACK_SKIP;
172                    break;
173                case STOP_CB:
174                    callback=UCNV_TO_U_CALLBACK_STOP;
175                    break;
176                case ESC_CB:
177                    callback=UCNV_TO_U_CALLBACK_ESCAPE;
178                    break;
179                default:
180                    callback=NULL;
181                    break;
182                }
183                option=callback==NULL ? cbopt : cbopt+1;
184                if(*option==0) {
185                    option=NULL;
186                }
187
188                cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
189
190                if(U_FAILURE(errorCode)) {
191                    errln("error parsing conversion/toUnicode test case %d - %s",
192                            i, u_errorName(errorCode));
193                    errorCode=U_ZERO_ERROR;
194                } else {
195                    logln("TestToUnicode[%d] %s", i, charset);
196                    ToUnicodeCase(cc, callback, option);
197                }
198            }
199            delete testData;
200        }
201        delete dataModule;
202    }
203    else {
204        dataerrln("Could not load test conversion data");
205    }
206}
207
208void
209ConversionTest::TestFromUnicode() {
210    ConversionCase cc;
211    char charset[100], cbopt[4];
212    const char *option;
213    UnicodeString s, unicode, invalidUChars;
214    int32_t offsetsLength, index;
215    UConverterFromUCallback callback;
216
217    TestDataModule *dataModule;
218    TestData *testData;
219    const DataMap *testCase;
220    const UChar *p;
221    UErrorCode errorCode;
222    int32_t i, length;
223
224    errorCode=U_ZERO_ERROR;
225    dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
226    if(U_SUCCESS(errorCode)) {
227        testData=dataModule->createTestData("fromUnicode", errorCode);
228        if(U_SUCCESS(errorCode)) {
229            for(i=0; testData->nextCase(testCase, errorCode); ++i) {
230                if(U_FAILURE(errorCode)) {
231                    errln("error retrieving conversion/fromUnicode test case %d - %s",
232                            i, u_errorName(errorCode));
233                    errorCode=U_ZERO_ERROR;
234                    continue;
235                }
236
237                cc.caseNr=i;
238
239                s=testCase->getString("charset", errorCode);
240                s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
241                cc.charset=charset;
242
243                // BEGIN android-added
244                // To save space, Android does not build full ISO-2022-CN tables.
245                // We skip the TestGetKeywordValuesForLocale for counting available collations.
246                if (strlen(charset) >= 8 &&
247                    strncmp(charset+4, "2022-CN", 4) == 0) {
248                    continue;
249                }
250                // END android-added
251
252                unicode=testCase->getString("unicode", errorCode);
253                cc.unicode=unicode.getBuffer();
254                cc.unicodeLength=unicode.length();
255                cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
256
257                offsetsLength=0;
258                cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
259                if(offsetsLength==0) {
260                    cc.offsets=NULL;
261                } else if(offsetsLength!=cc.bytesLength) {
262                    errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
263                            i, cc.bytesLength, offsetsLength);
264                    errorCode=U_ILLEGAL_ARGUMENT_ERROR;
265                }
266
267                cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
268                cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
269
270                s=testCase->getString("errorCode", errorCode);
271                if(s==UNICODE_STRING("invalid", 7)) {
272                    cc.outErrorCode=U_INVALID_CHAR_FOUND;
273                } else if(s==UNICODE_STRING("illegal", 7)) {
274                    cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
275                } else if(s==UNICODE_STRING("truncated", 9)) {
276                    cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
277                } else {
278                    cc.outErrorCode=U_ZERO_ERROR;
279                }
280
281                s=testCase->getString("callback", errorCode);
282                cc.setSub=0; // default: no subchar
283
284                if((index=s.indexOf((UChar)0))>0) {
285                    // read NUL-separated subchar first, if any
286                    // copy the subchar from Latin-1 characters
287                    // start after the NUL
288                    p=s.getTerminatedBuffer();
289                    length=index+1;
290                    p+=length;
291                    length=s.length()-length;
292                    if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) {
293                        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
294                    } else {
295                        int32_t j;
296
297                        for(j=0; j<length; ++j) {
298                            cc.subchar[j]=(char)p[j];
299                        }
300                        // NUL-terminate the subchar
301                        cc.subchar[j]=0;
302                        cc.setSub=1;
303                    }
304
305                    // remove the NUL and subchar from s
306                    s.truncate(index);
307                } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
308                    // read a substitution string, separated by an equal sign
309                    p=s.getBuffer()+index+1;
310                    length=s.length()-(index+1);
311                    if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) {
312                        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
313                    } else {
314                        u_memcpy(cc.subString, p, length);
315                        // NUL-terminate the subString
316                        cc.subString[length]=0;
317                        cc.setSub=-1;
318                    }
319
320                    // remove the equal sign and subString from s
321                    s.truncate(index);
322                }
323
324                s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
325                cc.cbopt=cbopt;
326                switch(cbopt[0]) {
327                case SUB_CB:
328                    callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
329                    break;
330                case SKIP_CB:
331                    callback=UCNV_FROM_U_CALLBACK_SKIP;
332                    break;
333                case STOP_CB:
334                    callback=UCNV_FROM_U_CALLBACK_STOP;
335                    break;
336                case ESC_CB:
337                    callback=UCNV_FROM_U_CALLBACK_ESCAPE;
338                    break;
339                default:
340                    callback=NULL;
341                    break;
342                }
343                option=callback==NULL ? cbopt : cbopt+1;
344                if(*option==0) {
345                    option=NULL;
346                }
347
348                invalidUChars=testCase->getString("invalidUChars", errorCode);
349                cc.invalidUChars=invalidUChars.getBuffer();
350                cc.invalidLength=invalidUChars.length();
351
352                if(U_FAILURE(errorCode)) {
353                    errln("error parsing conversion/fromUnicode test case %d - %s",
354                            i, u_errorName(errorCode));
355                    errorCode=U_ZERO_ERROR;
356                } else {
357                    logln("TestFromUnicode[%d] %s", i, charset);
358                    FromUnicodeCase(cc, callback, option);
359                }
360            }
361            delete testData;
362        }
363        delete dataModule;
364    }
365    else {
366        dataerrln("Could not load test conversion data");
367    }
368}
369
370static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
371
372void
373ConversionTest::TestGetUnicodeSet() {
374    char charset[100];
375    UnicodeString s, map, mapnot;
376    int32_t which;
377
378    ParsePosition pos;
379    UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
380    UnicodeSet *cnvSetPtr = &cnvSet;
381    LocalUConverterPointer cnv;
382
383    TestDataModule *dataModule;
384    TestData *testData;
385    const DataMap *testCase;
386    UErrorCode errorCode;
387    int32_t i;
388
389    errorCode=U_ZERO_ERROR;
390    dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
391    if(U_SUCCESS(errorCode)) {
392        testData=dataModule->createTestData("getUnicodeSet", errorCode);
393        if(U_SUCCESS(errorCode)) {
394            for(i=0; testData->nextCase(testCase, errorCode); ++i) {
395                if(U_FAILURE(errorCode)) {
396                    errln("error retrieving conversion/getUnicodeSet test case %d - %s",
397                            i, u_errorName(errorCode));
398                    errorCode=U_ZERO_ERROR;
399                    continue;
400                }
401
402                s=testCase->getString("charset", errorCode);
403                s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
404
405                // BEGIN android-added
406                // To save space, Android does not build full ISO-2022-CN tables.
407                // We skip the TestGetKeywordValuesForLocale for counting available collations.
408                if (strlen(charset) >= 8 &&
409                    strncmp(charset+4, "2022-CN", 4) == 0) {
410                    continue;
411                }
412                // END android-added
413
414                map=testCase->getString("map", errorCode);
415                mapnot=testCase->getString("mapnot", errorCode);
416
417                which=testCase->getInt28("which", errorCode);
418
419                if(U_FAILURE(errorCode)) {
420                    errln("error parsing conversion/getUnicodeSet test case %d - %s",
421                            i, u_errorName(errorCode));
422                    errorCode=U_ZERO_ERROR;
423                    continue;
424                }
425
426                // test this test case
427                mapSet.clear();
428                mapnotSet.clear();
429
430                pos.setIndex(0);
431                mapSet.applyPattern(map, pos, 0, NULL, errorCode);
432                if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
433                    errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
434                          "    error index %d  index %d  U+%04x",
435                            i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
436                    errorCode=U_ZERO_ERROR;
437                    continue;
438                }
439
440                pos.setIndex(0);
441                mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
442                if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
443                    errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
444                          "    error index %d  index %d  U+%04x",
445                            i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
446                    errorCode=U_ZERO_ERROR;
447                    continue;
448                }
449
450                logln("TestGetUnicodeSet[%d] %s", i, charset);
451
452                cnv.adoptInstead(cnv_open(charset, errorCode));
453                if(U_FAILURE(errorCode)) {
454                    errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
455                            charset, i, u_errorName(errorCode));
456                    errorCode=U_ZERO_ERROR;
457                    continue;
458                }
459
460                ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode);
461
462                if(U_FAILURE(errorCode)) {
463                    errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
464                            charset, i, u_errorName(errorCode));
465                    errorCode=U_ZERO_ERROR;
466                    continue;
467                }
468
469                // are there items that must be in cnvSet but are not?
470                (diffSet=mapSet).removeAll(cnvSet);
471                if(!diffSet.isEmpty()) {
472                    diffSet.toPattern(s, TRUE);
473                    if(s.length()>100) {
474                        s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
475                    }
476                    errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
477                            charset, i);
478                    errln(s);
479                }
480
481                // are there items that must not be in cnvSet but are?
482                (diffSet=mapnotSet).retainAll(cnvSet);
483                if(!diffSet.isEmpty()) {
484                    diffSet.toPattern(s, TRUE);
485                    if(s.length()>100) {
486                        s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
487                    }
488                    errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
489                            charset, i);
490                    errln(s);
491                }
492            }
493            delete testData;
494        }
495        delete dataModule;
496    }
497    else {
498        dataerrln("Could not load test conversion data");
499    }
500}
501
502U_CDECL_BEGIN
503static void U_CALLCONV
504getUnicodeSetCallback(const void *context,
505                      UConverterFromUnicodeArgs * /*fromUArgs*/,
506                      const UChar* /*codeUnits*/,
507                      int32_t /*length*/,
508                      UChar32 codePoint,
509                      UConverterCallbackReason reason,
510                      UErrorCode *pErrorCode) {
511    if(reason<=UCNV_IRREGULAR) {
512        ((UnicodeSet *)context)->remove(codePoint);  // the converter cannot convert this code point
513        *pErrorCode=U_ZERO_ERROR;                    // skip
514    }  // else ignore the reset, close and clone calls.
515}
516U_CDECL_END
517
518// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
519void
520ConversionTest::TestGetUnicodeSet2() {
521    // Build a string with all code points.
522    UChar32 cpLimit;
523    int32_t s0Length;
524    if(quick) {
525        cpLimit=s0Length=0x10000;  // BMP only
526    } else {
527        cpLimit=0x110000;
528        s0Length=0x10000+0x200000;  // BMP + surrogate pairs
529    }
530    UChar *s0=new UChar[s0Length];
531    if(s0==NULL) {
532        return;
533    }
534    UChar *s=s0;
535    UChar32 c;
536    UChar c2;
537    // low BMP
538    for(c=0; c<=0xd7ff; ++c) {
539        *s++=(UChar)c;
540    }
541    // trail surrogates
542    for(c=0xdc00; c<=0xdfff; ++c) {
543        *s++=(UChar)c;
544    }
545    // lead surrogates
546    // (after trails so that there is not even one surrogate pair in between)
547    for(c=0xd800; c<=0xdbff; ++c) {
548        *s++=(UChar)c;
549    }
550    // high BMP
551    for(c=0xe000; c<=0xffff; ++c) {
552        *s++=(UChar)c;
553    }
554    // supplementary code points = surrogate pairs
555    if(cpLimit==0x110000) {
556        for(c=0xd800; c<=0xdbff; ++c) {
557            for(c2=0xdc00; c2<=0xdfff; ++c2) {
558                *s++=(UChar)c;
559                *s++=c2;
560            }
561        }
562    }
563
564    static const char *const cnvNames[]={
565        "UTF-8",
566        "UTF-7",
567        "UTF-16",
568        "US-ASCII",
569        "ISO-8859-1",
570        "windows-1252",
571        "Shift-JIS",
572        "ibm-1390",  // EBCDIC_STATEFUL table
573        "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
574        "HZ",
575        "ISO-2022-JP",
576        "JIS7",
577        "ISO-2022-CN",
578        "ISO-2022-CN-EXT",
579        "LMBCS"
580    };
581    LocalUConverterPointer cnv;
582    char buffer[1024];
583    int32_t i;
584    for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
585        UErrorCode errorCode=U_ZERO_ERROR;
586        cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
587        if(U_FAILURE(errorCode)) {
588            errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
589            continue;
590        }
591        UnicodeSet expected;
592        ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
593        if(U_FAILURE(errorCode)) {
594            errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
595            continue;
596        }
597        UConverterUnicodeSet which;
598        for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
599            if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
600                ucnv_setFallback(cnv.getAlias(), TRUE);
601            }
602            expected.add(0, cpLimit-1);
603            s=s0;
604            UBool flush;
605            do {
606                char *t=buffer;
607                flush=(UBool)(s==s0+s0Length);
608                ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
609                if(U_FAILURE(errorCode)) {
610                    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
611                        errorCode=U_ZERO_ERROR;
612                        continue;
613                    } else {
614                        break;  // unexpected error, should not occur
615                    }
616                }
617            } while(!flush);
618            UnicodeSet set;
619            ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode);
620            if(cpLimit<0x110000) {
621                set.remove(cpLimit, 0x10ffff);
622            }
623            if(which==UCNV_ROUNDTRIP_SET) {
624                // ignore PUA code points because they will be converted even if they
625                // are fallbacks and when other fallbacks are turned off,
626                // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
627                expected.remove(0xe000, 0xf8ff);
628                expected.remove(0xf0000, 0xffffd);
629                expected.remove(0x100000, 0x10fffd);
630                set.remove(0xe000, 0xf8ff);
631                set.remove(0xf0000, 0xffffd);
632                set.remove(0x100000, 0x10fffd);
633            }
634            if(set!=expected) {
635                // First try to see if we have different sets because ucnv_getUnicodeSet()
636                // added strings: The above conversion method does not tell us what strings might be convertible.
637                // Remove strings from the set and compare again.
638                // Unfortunately, there are no good, direct set methods for finding out whether there are strings
639                // in the set, nor for enumerating or removing just them.
640                // Intersect all code points with the set. The intersection will not contain strings.
641                UnicodeSet temp(0, 0x10ffff);
642                temp.retainAll(set);
643                set=temp;
644            }
645            if(set!=expected) {
646                UnicodeSet diffSet;
647                UnicodeString out;
648
649                // are there items that must be in the set but are not?
650                (diffSet=expected).removeAll(set);
651                if(!diffSet.isEmpty()) {
652                    diffSet.toPattern(out, TRUE);
653                    if(out.length()>100) {
654                        out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
655                    }
656                    errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
657                            cnvNames[i], which);
658                    errln(out);
659                }
660
661                // are there items that must not be in the set but are?
662                (diffSet=set).removeAll(expected);
663                if(!diffSet.isEmpty()) {
664                    diffSet.toPattern(out, TRUE);
665                    if(out.length()>100) {
666                        out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
667                    }
668                    errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
669                            cnvNames[i], which);
670                    errln(out);
671                }
672            }
673        }
674    }
675
676    delete [] s0;
677}
678
679// Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping
680// If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated
681void
682ConversionTest::TestDefaultIgnorableCallback() {
683    UErrorCode status = U_ZERO_ERROR;
684    const char *cnv_name = "euc-jp-2007";
685    const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
686    const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]";
687
688    UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status);
689    if (U_FAILURE(status)) {
690        dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
691        return;
692    }
693
694    UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status);
695    if (U_FAILURE(status)) {
696        dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
697        return;
698    }
699
700    UConverter *cnv = cnv_open(cnv_name, status);
701    if (U_FAILURE(status)) {
702        dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
703        return;
704    }
705
706    // set callback for the converter
707    ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
708
709    UChar32 input[1];
710    char output[10];
711    int32_t outputLength;
712
713    // test default ignorables are ignored
714    int size = set_ignorable->size();
715    for (int i = 0; i < size; i++) {
716        status = U_ZERO_ERROR;
717        outputLength= 0;
718
719        input[0] = set_ignorable->charAt(i);
720
721        outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
722        if (U_FAILURE(status) || outputLength != 0) {
723            errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
724        }
725    }
726
727    // test non-ignorables are not ignored
728    size = set_not_ignorable->size();
729    for (int i = 0; i < size; i++) {
730        status = U_ZERO_ERROR;
731        outputLength= 0;
732
733        input[0] = set_not_ignorable->charAt(i);
734
735        if (input[0] == 0) {
736            continue;
737        }
738
739        outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
740        if (U_FAILURE(status) || outputLength <= 0) {
741            errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
742        }
743    }
744
745    ucnv_close(cnv);
746    delete set_not_ignorable;
747    delete set_ignorable;
748}
749
750// open testdata or ICU data converter ------------------------------------- ***
751
752UConverter *
753ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
754    if(name!=NULL && *name=='+') {
755        // Converter names that start with '+' are ignored in ICU4J tests.
756        ++name;
757    }
758    if(name!=NULL && *name=='*') {
759        /* loadTestData(): set the data directory */
760        return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
761    } else {
762        return ucnv_open(name, &errorCode);
763    }
764}
765
766// output helpers ---------------------------------------------------------- ***
767
// Returns the lowercase hexadecimal character for a digit value 0..15.
static inline char
hexDigit(uint8_t digit) {
    if(digit>9) {
        return (char)('a'-10+digit);
    }
    return (char)('0'+digit);
}
772
773static char *
774printBytes(const uint8_t *bytes, int32_t length, char *out) {
775    uint8_t b;
776
777    if(length>0) {
778        b=*bytes++;
779        --length;
780        *out++=hexDigit((uint8_t)(b>>4));
781        *out++=hexDigit((uint8_t)(b&0xf));
782    }
783
784    while(length>0) {
785        b=*bytes++;
786        --length;
787        *out++=' ';
788        *out++=hexDigit((uint8_t)(b>>4));
789        *out++=hexDigit((uint8_t)(b&0xf));
790    }
791    *out++=0;
792    return out;
793}
794
795static char *
796printUnicode(const UChar *unicode, int32_t length, char *out) {
797    UChar32 c;
798    int32_t i;
799
800    for(i=0; i<length;) {
801        if(i>0) {
802            *out++=' ';
803        }
804        U16_NEXT(unicode, i, length, c);
805        // write 4..6 digits
806        if(c>=0x100000) {
807            *out++='1';
808        }
809        if(c>=0x10000) {
810            *out++=hexDigit((uint8_t)((c>>16)&0xf));
811        }
812        *out++=hexDigit((uint8_t)((c>>12)&0xf));
813        *out++=hexDigit((uint8_t)((c>>8)&0xf));
814        *out++=hexDigit((uint8_t)((c>>4)&0xf));
815        *out++=hexDigit((uint8_t)(c&0xf));
816    }
817    *out++=0;
818    return out;
819}
820
// Writes each offset as exactly two characters, separated by single spaces,
// followed by a terminating NUL:
//   below -9 -> "-x",  -9..-1 -> "-9".."-1",
//   0..99 -> " 0".."99" (space-padded),  above 99 -> "xx"
// A NULL offsets array prints as an empty string.
// Returns the position just after the NUL.
static char *
printOffsets(const int32_t *offsets, int32_t length, char *out) {
    const int32_t count= offsets!=NULL ? length : 0;

    for(int32_t idx=0; idx<count; ++idx) {
        const int32_t value=offsets[idx];
        char first, second;

        if(value>99) {
            first='x';
            second='x';
        } else if(value>=0) {
            const int32_t tens=value/10;
            first= tens==0 ? ' ' : (char)('0'+tens);
            second=(char)('0'+value%10);
        } else if(value>=-9) {
            first='-';
            second=(char)('0'-value);
        } else /* value<-9 */ {
            first='-';
            second='x';
        }

        if(idx!=0) {
            *out++=' ';
        }
        *out++=first;
        *out++=second;
    }
    *out++=0;
    return out;
}
853
854// toUnicode test worker functions ----------------------------------------- ***
855
856static int32_t
857stepToUnicode(ConversionCase &cc, UConverter *cnv,
858              UChar *result, int32_t resultCapacity,
859              int32_t *resultOffsets, /* also resultCapacity */
860              int32_t step,
861              UErrorCode *pErrorCode) {
862    const char *source, *sourceLimit, *bytesLimit;
863    UChar *target, *targetLimit, *resultLimit;
864    UBool flush;
865
866    source=(const char *)cc.bytes;
867    target=result;
868    bytesLimit=source+cc.bytesLength;
869    resultLimit=result+resultCapacity;
870
871    if(step>=0) {
872        // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
873        // move only one buffer (in vs. out) at a time to be extra mean
874        // step==0 performs bulk conversion and generates offsets
875
876        // initialize the partial limits for the loop
877        if(step==0) {
878            // use the entire buffers
879            sourceLimit=bytesLimit;
880            targetLimit=resultLimit;
881            flush=cc.finalFlush;
882        } else {
883            // start with empty partial buffers
884            sourceLimit=source;
885            targetLimit=target;
886            flush=FALSE;
887
888            // output offsets only for bulk conversion
889            resultOffsets=NULL;
890        }
891
892        for(;;) {
893            // resetting the opposite conversion direction must not affect this one
894            ucnv_resetFromUnicode(cnv);
895
896            // convert
897            ucnv_toUnicode(cnv,
898                &target, targetLimit,
899                &source, sourceLimit,
900                resultOffsets,
901                flush, pErrorCode);
902
903            // check pointers and errors
904            if(source>sourceLimit || target>targetLimit) {
905                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
906                break;
907            } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
908                if(target!=targetLimit) {
909                    // buffer overflow must only be set when the target is filled
910                    *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
911                    break;
912                } else if(targetLimit==resultLimit) {
913                    // not just a partial overflow
914                    break;
915                }
916
917                // the partial target is filled, set a new limit, reset the error and continue
918                targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
919                *pErrorCode=U_ZERO_ERROR;
920            } else if(U_FAILURE(*pErrorCode)) {
921                // some other error occurred, done
922                break;
923            } else {
924                if(source!=sourceLimit) {
925                    // when no error occurs, then the input must be consumed
926                    *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
927                    break;
928                }
929
930                if(sourceLimit==bytesLimit) {
931                    // we are done
932                    break;
933                }
934
935                // the partial conversion succeeded, set a new limit and continue
936                sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
937                flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
938            }
939        }
940    } else /* step<0 */ {
941        /*
942         * step==-1: call only ucnv_getNextUChar()
943         * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
944         *   if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
945         *   else give it at most (-step-2)/2 bytes
946         */
947        UChar32 c;
948
949        // end the loop by getting an index out of bounds error
950        for(;;) {
951            // resetting the opposite conversion direction must not affect this one
952            ucnv_resetFromUnicode(cnv);
953
954            // convert
955            if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
956                sourceLimit=source; // use sourceLimit not as a real limit
957                                    // but to remember the pre-getNextUChar source pointer
958                c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);
959
960                // check pointers and errors
961                if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
962                    if(source!=bytesLimit) {
963                        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
964                    } else {
965                        *pErrorCode=U_ZERO_ERROR;
966                    }
967                    break;
968                } else if(U_FAILURE(*pErrorCode)) {
969                    break;
970                }
971                // source may not move if c is from previous overflow
972
973                if(target==resultLimit) {
974                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
975                    break;
976                }
977                if(c<=0xffff) {
978                    *target++=(UChar)c;
979                } else {
980                    *target++=U16_LEAD(c);
981                    if(target==resultLimit) {
982                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
983                        break;
984                    }
985                    *target++=U16_TRAIL(c);
986                }
987
988                // alternate between -n-1 and -n but leave -1 alone
989                if(step<-1) {
990                    ++step;
991                }
992            } else /* step is even */ {
993                // allow only one UChar output
994                targetLimit=target<resultLimit ? target+1 : resultLimit;
995
996                // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
997                // and never output offsets
998                if(step==-2) {
999                    sourceLimit=bytesLimit;
1000                } else {
1001                    sourceLimit=source+(-step-2)/2;
1002                    if(sourceLimit>bytesLimit) {
1003                        sourceLimit=bytesLimit;
1004                    }
1005                }
1006
1007                ucnv_toUnicode(cnv,
1008                    &target, targetLimit,
1009                    &source, sourceLimit,
1010                    NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);
1011
1012                // check pointers and errors
1013                if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1014                    if(target!=targetLimit) {
1015                        // buffer overflow must only be set when the target is filled
1016                        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1017                        break;
1018                    } else if(targetLimit==resultLimit) {
1019                        // not just a partial overflow
1020                        break;
1021                    }
1022
1023                    // the partial target is filled, set a new limit and continue
1024                    *pErrorCode=U_ZERO_ERROR;
1025                } else if(U_FAILURE(*pErrorCode)) {
1026                    // some other error occurred, done
1027                    break;
1028                } else {
1029                    if(source!=sourceLimit) {
1030                        // when no error occurs, then the input must be consumed
1031                        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1032                        break;
1033                    }
1034
1035                    // we are done (flush==TRUE) but we continue, to get the index out of bounds error above
1036                }
1037
1038                --step;
1039            }
1040        }
1041    }
1042
1043    return (int32_t)(target-result);
1044}
1045
1046UBool
1047ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
1048    // open the converter
1049    IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
1050    LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
1051    if(errorCode.isFailure()) {
1052        errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1053                cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
1054        errorCode.reset();
1055        return FALSE;
1056    }
1057
1058    // set the callback
1059    if(callback!=NULL) {
1060        ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode);
1061        if(U_FAILURE(errorCode)) {
1062            errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
1063                    cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1064            return FALSE;
1065        }
1066    }
1067
1068    int32_t resultOffsets[256];
1069    UChar result[256];
1070    int32_t resultLength;
1071    UBool ok;
1072
1073    static const struct {
1074        int32_t step;
1075        const char *name;
1076    } steps[]={
1077        { 0, "bulk" }, // must be first for offsets to be checked
1078        { 1, "step=1" },
1079        { 3, "step=3" },
1080        { 7, "step=7" },
1081        { -1, "getNext" },
1082        { -2, "toU(bulk)+getNext" },
1083        { -3, "getNext+toU(bulk)" },
1084        { -4, "toU(1)+getNext" },
1085        { -5, "getNext+toU(1)" },
1086        { -12, "toU(5)+getNext" },
1087        { -13, "getNext+toU(5)" },
1088    };
1089    int32_t i, step;
1090
1091    ok=TRUE;
1092    for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1093        step=steps[i].step;
1094        if(step<0 && !cc.finalFlush) {
1095            // skip ucnv_getNextUChar() if !finalFlush because
1096            // ucnv_getNextUChar() always implies flush
1097            continue;
1098        }
1099        if(step!=0) {
1100            // bulk test is first, then offsets are not checked any more
1101            cc.offsets=NULL;
1102        }
1103        else {
1104            memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1105        }
1106        memset(result, -1, UPRV_LENGTHOF(result));
1107        errorCode.reset();
1108        resultLength=stepToUnicode(cc, cnv.getAlias(),
1109                                result, UPRV_LENGTHOF(result),
1110                                step==0 ? resultOffsets : NULL,
1111                                step, errorCode);
1112        ok=checkToUnicode(
1113                cc, cnv.getAlias(), steps[i].name,
1114                result, resultLength,
1115                cc.offsets!=NULL ? resultOffsets : NULL,
1116                errorCode);
1117        if(errorCode.isFailure() || !cc.finalFlush) {
1118            // reset if an error occurred or we did not flush
1119            // otherwise do nothing to make sure that flushing resets
1120            ucnv_resetToUnicode(cnv.getAlias());
1121        }
1122        if (cc.offsets != NULL && resultOffsets[resultLength] != -1) {
1123            errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1124                cc.caseNr, cc.charset, resultLength);
1125        }
1126        if (result[resultLength] != (UChar)-1) {
1127            errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
1128                cc.caseNr, cc.charset, resultLength);
1129        }
1130    }
1131
1132    // not a real loop, just a convenience for breaking out of the block
1133    while(ok && cc.finalFlush) {
1134        // test ucnv_toUChars()
1135        memset(result, 0, sizeof(result));
1136
1137        errorCode.reset();
1138        resultLength=ucnv_toUChars(cnv.getAlias(),
1139                        result, UPRV_LENGTHOF(result),
1140                        (const char *)cc.bytes, cc.bytesLength,
1141                        errorCode);
1142        ok=checkToUnicode(
1143                cc, cnv.getAlias(), "toUChars",
1144                result, resultLength,
1145                NULL,
1146                errorCode);
1147        if(!ok) {
1148            break;
1149        }
1150
1151        // test preflighting
1152        // keep the correct result for simple checking
1153        errorCode.reset();
1154        resultLength=ucnv_toUChars(cnv.getAlias(),
1155                        NULL, 0,
1156                        (const char *)cc.bytes, cc.bytesLength,
1157                        errorCode);
1158        if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING || errorCode.get()==U_BUFFER_OVERFLOW_ERROR) {
1159            errorCode.reset();
1160        }
1161        ok=checkToUnicode(
1162                cc, cnv.getAlias(), "preflight toUChars",
1163                result, resultLength,
1164                NULL,
1165                errorCode);
1166        break;
1167    }
1168
1169    errorCode.reset();  // all errors have already been reported
1170    return ok;
1171}
1172
1173UBool
1174ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1175                               const UChar *result, int32_t resultLength,
1176                               const int32_t *resultOffsets,
1177                               UErrorCode resultErrorCode) {
1178    char resultInvalidChars[8];
1179    int8_t resultInvalidLength;
1180    UErrorCode errorCode;
1181
1182    const char *msg;
1183
1184    // reset the message; NULL will mean "ok"
1185    msg=NULL;
1186
1187    errorCode=U_ZERO_ERROR;
1188    resultInvalidLength=sizeof(resultInvalidChars);
1189    ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
1190    if(U_FAILURE(errorCode)) {
1191        errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
1192                cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1193        return FALSE;
1194    }
1195
1196    // check everything that might have gone wrong
1197    if(cc.unicodeLength!=resultLength) {
1198        msg="wrong result length";
1199    } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
1200        msg="wrong result string";
1201    } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) {
1202        msg="wrong offsets";
1203    } else if(cc.outErrorCode!=resultErrorCode) {
1204        msg="wrong error code";
1205    } else if(cc.invalidLength!=resultInvalidLength) {
1206        msg="wrong length of last invalid input";
1207    } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
1208        msg="wrong last invalid input";
1209    }
1210
1211    if(msg==NULL) {
1212        return TRUE;
1213    } else {
1214        char buffer[2000]; // one buffer for all strings
1215        char *s, *bytesString, *unicodeString, *resultString,
1216            *offsetsString, *resultOffsetsString,
1217            *invalidCharsString, *resultInvalidCharsString;
1218
1219        bytesString=s=buffer;
1220        s=printBytes(cc.bytes, cc.bytesLength, bytesString);
1221        s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
1222        s=printUnicode(result, resultLength, resultString=s);
1223        s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
1224        s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1225        s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
1226        s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
1227
1228        if((s-buffer)>(int32_t)sizeof(buffer)) {
1229            errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
1230                    cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1231            exit(1);
1232        }
1233
1234        errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1235              "  bytes <%s>[%d]\n"
1236              " expected <%s>[%d]\n"
1237              "  result  <%s>[%d]\n"
1238              " offsets         <%s>\n"
1239              "  result offsets <%s>\n"
1240              " error code expected %s got %s\n"
1241              "  invalidChars expected <%s> got <%s>\n",
1242              cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1243              bytesString, cc.bytesLength,
1244              unicodeString, cc.unicodeLength,
1245              resultString, resultLength,
1246              offsetsString,
1247              resultOffsetsString,
1248              u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1249              invalidCharsString, resultInvalidCharsString);
1250
1251        return FALSE;
1252    }
1253}
1254
1255// fromUnicode test worker functions --------------------------------------- ***
1256
1257static int32_t
1258stepFromUTF8(ConversionCase &cc,
1259             UConverter *utf8Cnv, UConverter *cnv,
1260             char *result, int32_t resultCapacity,
1261             int32_t step,
1262             UErrorCode *pErrorCode) {
1263    const char *source, *sourceLimit, *utf8Limit;
1264    UChar pivotBuffer[32];
1265    UChar *pivotSource, *pivotTarget, *pivotLimit;
1266    char *target, *targetLimit, *resultLimit;
1267    UBool flush;
1268
1269    source=cc.utf8;
1270    pivotSource=pivotTarget=pivotBuffer;
1271    target=result;
1272    utf8Limit=source+cc.utf8Length;
1273    resultLimit=result+resultCapacity;
1274
1275    // call ucnv_convertEx() with in/out buffers no larger than (step) at a time
1276    // move only one buffer (in vs. out) at a time to be extra mean
1277    // step==0 performs bulk conversion
1278
1279    // initialize the partial limits for the loop
1280    if(step==0) {
1281        // use the entire buffers
1282        sourceLimit=utf8Limit;
1283        targetLimit=resultLimit;
1284        flush=cc.finalFlush;
1285
1286        pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
1287    } else {
1288        // start with empty partial buffers
1289        sourceLimit=source;
1290        targetLimit=target;
1291        flush=FALSE;
1292
1293        // empty pivot is not allowed, make it of length step
1294        pivotLimit=pivotBuffer+step;
1295    }
1296
1297    for(;;) {
1298        // resetting the opposite conversion direction must not affect this one
1299        ucnv_resetFromUnicode(utf8Cnv);
1300        ucnv_resetToUnicode(cnv);
1301
1302        // convert
1303        ucnv_convertEx(cnv, utf8Cnv,
1304            &target, targetLimit,
1305            &source, sourceLimit,
1306            pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
1307            FALSE, flush, pErrorCode);
1308
1309        // check pointers and errors
1310        if(source>sourceLimit || target>targetLimit) {
1311            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1312            break;
1313        } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1314            if(target!=targetLimit) {
1315                // buffer overflow must only be set when the target is filled
1316                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1317                break;
1318            } else if(targetLimit==resultLimit) {
1319                // not just a partial overflow
1320                break;
1321            }
1322
1323            // the partial target is filled, set a new limit, reset the error and continue
1324            targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1325            *pErrorCode=U_ZERO_ERROR;
1326        } else if(U_FAILURE(*pErrorCode)) {
1327            if(pivotSource==pivotBuffer) {
1328                // toUnicode error, should not occur
1329                // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1330                break;
1331            } else {
1332                // fromUnicode error
1333                // some other error occurred, done
1334                break;
1335            }
1336        } else {
1337            if(source!=sourceLimit) {
1338                // when no error occurs, then the input must be consumed
1339                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1340                break;
1341            }
1342
1343            if(sourceLimit==utf8Limit) {
1344                // we are done
1345                if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
1346                    // ucnv_convertEx() warns about not terminating the output
1347                    // but ucnv_fromUnicode() does not and so
1348                    // checkFromUnicode() does not expect it
1349                    *pErrorCode=U_ZERO_ERROR;
1350                }
1351                break;
1352            }
1353
1354            // the partial conversion succeeded, set a new limit and continue
1355            sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
1356            flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
1357        }
1358    }
1359
1360    return (int32_t)(target-result);
1361}
1362
1363static int32_t
1364stepFromUnicode(ConversionCase &cc, UConverter *cnv,
1365                char *result, int32_t resultCapacity,
1366                int32_t *resultOffsets, /* also resultCapacity */
1367                int32_t step,
1368                UErrorCode *pErrorCode) {
1369    const UChar *source, *sourceLimit, *unicodeLimit;
1370    char *target, *targetLimit, *resultLimit;
1371    UBool flush;
1372
1373    source=cc.unicode;
1374    target=result;
1375    unicodeLimit=source+cc.unicodeLength;
1376    resultLimit=result+resultCapacity;
1377
1378    // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
1379    // move only one buffer (in vs. out) at a time to be extra mean
1380    // step==0 performs bulk conversion and generates offsets
1381
1382    // initialize the partial limits for the loop
1383    if(step==0) {
1384        // use the entire buffers
1385        sourceLimit=unicodeLimit;
1386        targetLimit=resultLimit;
1387        flush=cc.finalFlush;
1388    } else {
1389        // start with empty partial buffers
1390        sourceLimit=source;
1391        targetLimit=target;
1392        flush=FALSE;
1393
1394        // output offsets only for bulk conversion
1395        resultOffsets=NULL;
1396    }
1397
1398    for(;;) {
1399        // resetting the opposite conversion direction must not affect this one
1400        ucnv_resetToUnicode(cnv);
1401
1402        // convert
1403        ucnv_fromUnicode(cnv,
1404            &target, targetLimit,
1405            &source, sourceLimit,
1406            resultOffsets,
1407            flush, pErrorCode);
1408
1409        // check pointers and errors
1410        if(source>sourceLimit || target>targetLimit) {
1411            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1412            break;
1413        } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1414            if(target!=targetLimit) {
1415                // buffer overflow must only be set when the target is filled
1416                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1417                break;
1418            } else if(targetLimit==resultLimit) {
1419                // not just a partial overflow
1420                break;
1421            }
1422
1423            // the partial target is filled, set a new limit, reset the error and continue
1424            targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1425            *pErrorCode=U_ZERO_ERROR;
1426        } else if(U_FAILURE(*pErrorCode)) {
1427            // some other error occurred, done
1428            break;
1429        } else {
1430            if(source!=sourceLimit) {
1431                // when no error occurs, then the input must be consumed
1432                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1433                break;
1434            }
1435
1436            if(sourceLimit==unicodeLimit) {
1437                // we are done
1438                break;
1439            }
1440
1441            // the partial conversion succeeded, set a new limit and continue
1442            sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
1443            flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
1444        }
1445    }
1446
1447    return (int32_t)(target-result);
1448}
1449
1450UBool
1451ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
1452    UConverter *cnv;
1453    UErrorCode errorCode;
1454
1455    // open the converter
1456    errorCode=U_ZERO_ERROR;
1457    cnv=cnv_open(cc.charset, errorCode);
1458    if(U_FAILURE(errorCode)) {
1459        errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1460                cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1461        return FALSE;
1462    }
1463    ucnv_resetToUnicode(utf8Cnv);
1464
1465    // set the callback
1466    if(callback!=NULL) {
1467        ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
1468        if(U_FAILURE(errorCode)) {
1469            errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
1470                    cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1471            ucnv_close(cnv);
1472            return FALSE;
1473        }
1474    }
1475
1476    // set the fallbacks flag
1477    // TODO change with Jitterbug 2401, then add a similar call for toUnicode too
1478    ucnv_setFallback(cnv, cc.fallbacks);
1479
1480    // set the subchar
1481    int32_t length;
1482
1483    if(cc.setSub>0) {
1484        length=(int32_t)strlen(cc.subchar);
1485        ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
1486        if(U_FAILURE(errorCode)) {
1487            errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
1488                    cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1489            ucnv_close(cnv);
1490            return FALSE;
1491        }
1492    } else if(cc.setSub<0) {
1493        ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
1494        if(U_FAILURE(errorCode)) {
1495            errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
1496                    cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1497            ucnv_close(cnv);
1498            return FALSE;
1499        }
1500    }
1501
1502    // convert unicode to utf8
1503    char utf8[256];
1504    cc.utf8=utf8;
1505    u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
1506                cc.unicode, cc.unicodeLength,
1507                &errorCode);
1508    if(U_FAILURE(errorCode)) {
1509        // skip UTF-8 testing of a string with an unpaired surrogate,
1510        // or of one that's too long
1511        // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1512        cc.utf8Length=-1;
1513    }
1514
1515    int32_t resultOffsets[256];
1516    char result[256];
1517    int32_t resultLength;
1518    UBool ok;
1519
1520    static const struct {
1521        int32_t step;
1522        const char *name, *utf8Name;
1523    } steps[]={
1524        { 0, "bulk",   "utf8" }, // must be first for offsets to be checked
1525        { 1, "step=1", "utf8 step=1" },
1526        { 3, "step=3", "utf8 step=3" },
1527        { 7, "step=7", "utf8 step=7" }
1528    };
1529    int32_t i, step;
1530
1531    ok=TRUE;
1532    for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1533        step=steps[i].step;
1534        memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1535        memset(result, -1, UPRV_LENGTHOF(result));
1536        errorCode=U_ZERO_ERROR;
1537        resultLength=stepFromUnicode(cc, cnv,
1538                                result, UPRV_LENGTHOF(result),
1539                                step==0 ? resultOffsets : NULL,
1540                                step, &errorCode);
1541        ok=checkFromUnicode(
1542                cc, cnv, steps[i].name,
1543                (uint8_t *)result, resultLength,
1544                cc.offsets!=NULL ? resultOffsets : NULL,
1545                errorCode);
1546        if(U_FAILURE(errorCode) || !cc.finalFlush) {
1547            // reset if an error occurred or we did not flush
1548            // otherwise do nothing to make sure that flushing resets
1549            ucnv_resetFromUnicode(cnv);
1550        }
1551        if (resultOffsets[resultLength] != -1) {
1552            errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1553                cc.caseNr, cc.charset, resultLength);
1554        }
1555        if (result[resultLength] != (char)-1) {
1556            errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
1557                cc.caseNr, cc.charset, resultLength);
1558        }
1559
1560        // bulk test is first, then offsets are not checked any more
1561        cc.offsets=NULL;
1562
1563        // test direct conversion from UTF-8
1564        if(cc.utf8Length>=0) {
1565            errorCode=U_ZERO_ERROR;
1566            resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
1567                                    result, UPRV_LENGTHOF(result),
1568                                    step, &errorCode);
1569            ok=checkFromUnicode(
1570                    cc, cnv, steps[i].utf8Name,
1571                    (uint8_t *)result, resultLength,
1572                    NULL,
1573                    errorCode);
1574            if(U_FAILURE(errorCode) || !cc.finalFlush) {
1575                // reset if an error occurred or we did not flush
1576                // otherwise do nothing to make sure that flushing resets
1577                ucnv_resetToUnicode(utf8Cnv);
1578                ucnv_resetFromUnicode(cnv);
1579            }
1580        }
1581    }
1582
1583    // not a real loop, just a convenience for breaking out of the block
1584    while(ok && cc.finalFlush) {
1585        // test ucnv_fromUChars()
1586        memset(result, 0, sizeof(result));
1587
1588        errorCode=U_ZERO_ERROR;
1589        resultLength=ucnv_fromUChars(cnv,
1590                        result, UPRV_LENGTHOF(result),
1591                        cc.unicode, cc.unicodeLength,
1592                        &errorCode);
1593        ok=checkFromUnicode(
1594                cc, cnv, "fromUChars",
1595                (uint8_t *)result, resultLength,
1596                NULL,
1597                errorCode);
1598        if(!ok) {
1599            break;
1600        }
1601
1602        // test preflighting
1603        // keep the correct result for simple checking
1604        errorCode=U_ZERO_ERROR;
1605        resultLength=ucnv_fromUChars(cnv,
1606                        NULL, 0,
1607                        cc.unicode, cc.unicodeLength,
1608                        &errorCode);
1609        if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) {
1610            errorCode=U_ZERO_ERROR;
1611        }
1612        ok=checkFromUnicode(
1613                cc, cnv, "preflight fromUChars",
1614                (uint8_t *)result, resultLength,
1615                NULL,
1616                errorCode);
1617        break;
1618    }
1619
1620    ucnv_close(cnv);
1621    return ok;
1622}
1623
1624UBool
1625ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1626                                 const uint8_t *result, int32_t resultLength,
1627                                 const int32_t *resultOffsets,
1628                                 UErrorCode resultErrorCode) {
1629    UChar resultInvalidUChars[8];
1630    int8_t resultInvalidLength;
1631    UErrorCode errorCode;
1632
1633    const char *msg;
1634
1635    // reset the message; NULL will mean "ok"
1636    msg=NULL;
1637
1638    errorCode=U_ZERO_ERROR;
1639    resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
1640    ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
1641    if(U_FAILURE(errorCode)) {
1642        errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
1643                cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1644        return FALSE;
1645    }
1646
1647    // check everything that might have gone wrong
1648    if(cc.bytesLength!=resultLength) {
1649        msg="wrong result length";
1650    } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
1651        msg="wrong result string";
1652    } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) {
1653        msg="wrong offsets";
1654    } else if(cc.outErrorCode!=resultErrorCode) {
1655        msg="wrong error code";
1656    } else if(cc.invalidLength!=resultInvalidLength) {
1657        msg="wrong length of last invalid input";
1658    } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
1659        msg="wrong last invalid input";
1660    }
1661
1662    if(msg==NULL) {
1663        return TRUE;
1664    } else {
1665        char buffer[2000]; // one buffer for all strings
1666        char *s, *unicodeString, *bytesString, *resultString,
1667            *offsetsString, *resultOffsetsString,
1668            *invalidCharsString, *resultInvalidUCharsString;
1669
1670        unicodeString=s=buffer;
1671        s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
1672        s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
1673        s=printBytes(result, resultLength, resultString=s);
1674        s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
1675        s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1676        s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
1677        s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
1678
1679        if((s-buffer)>(int32_t)sizeof(buffer)) {
1680            errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
1681                    cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1682            exit(1);
1683        }
1684
1685        errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1686              "  unicode <%s>[%d]\n"
1687              " expected <%s>[%d]\n"
1688              "  result  <%s>[%d]\n"
1689              " offsets         <%s>\n"
1690              "  result offsets <%s>\n"
1691              " error code expected %s got %s\n"
1692              "  invalidChars expected <%s> got <%s>\n",
1693              cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1694              unicodeString, cc.unicodeLength,
1695              bytesString, cc.bytesLength,
1696              resultString, resultLength,
1697              offsetsString,
1698              resultOffsetsString,
1699              u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1700              invalidCharsString, resultInvalidUCharsString);
1701
1702        return FALSE;
1703    }
1704}
1705
1706#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
1707