1/*
2 *******************************************************************************
3 * Copyright (C) 2002-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7
8package com.ibm.icu.dev.test.charset;
9
10import java.nio.ByteBuffer;
11import java.nio.CharBuffer;
12import java.nio.charset.Charset;
13import java.nio.charset.CharsetDecoder;
14import java.nio.charset.CharsetEncoder;
15import java.nio.charset.CoderResult;
16import java.nio.charset.CodingErrorAction;
17import java.util.Iterator;
18
19import com.ibm.icu.charset.CharsetCallback;
20import com.ibm.icu.charset.CharsetDecoderICU;
21import com.ibm.icu.charset.CharsetEncoderICU;
22import com.ibm.icu.charset.CharsetICU;
23import com.ibm.icu.charset.CharsetProviderICU;
24import com.ibm.icu.dev.test.ModuleTest;
25import com.ibm.icu.dev.test.TestDataModule.DataMap;
26import com.ibm.icu.impl.ICUResourceBundle;
27import com.ibm.icu.text.UnicodeSet;
28
29/**
30 * This maps to convtest.c which tests the test file for data-driven conversion tests.
31 *
32 */
33public class TestConversion extends ModuleTest {
34    /**
35     * This maps to the C struct of conversion case in convtest.h that stores the
36     * data for a conversion test
37     *
38     */
39    private class ConversionCase {
40        int caseNr;                                             // testcase index
41        String option = null;                                   // callback options
42        CodingErrorAction cbErrorAction = null;                 // callback action type
43        CharBuffer toUnicodeResult = null;
44        ByteBuffer fromUnicodeResult = null;
45
46        // data retrieved from a test case conversion.txt
47        String charset;                                         // charset
48        String unicode;                                         // unicode string
49        ByteBuffer bytes;                                       // byte
50        int[] offsets;                                          // offsets
51        boolean finalFlush;                                     // flush
52        boolean fallbacks;                                      // fallback
53        String outErrorCode;                                    // errorCode
54        String cbopt;                                           // callback
55
56        // TestGetUnicodeSet variables
57        String map;
58        String mapnot;
59        int which;
60
61        // CharsetCallback encoder and decoder
62        CharsetCallback.Decoder cbDecoder = null;
63        CharsetCallback.Encoder cbEncoder = null;
64
65        String caseNrAsString() {
66            return "[" + caseNr + "]";
67        }
68    }
69
70    /* In the data-driven conversion test, converters that are not available in
71     * ICU4J are marked with the following leading symbol.
72     */
73    private static final char UNSUPPORTED_CHARSET_SYMBOL = '+';
74
75    // public methods --------------------------------------------------------
76
77    public static void main(String[] args) throws Exception {
78        new TestConversion().run(args);
79    }
80
81    public TestConversion() {
82        super("com/ibm/icu/dev/data/testdata/", "conversion");
83    }
84
85    /*
86     * This method maps to the convtest.cpp runIndexedTest() method to run each
87     * type of conversion.
88     */
89    public void processModules() {
90        try {
91            int testFromUnicode = 0;
92            int testToUnicode = 0;
93            String testName = t.getName().toString();
94
95            // Iterate through and get each of the test case to process
96            for (Iterator iter = t.getDataIterator(); iter.hasNext();) {
97                DataMap testcase = (DataMap) iter.next();
98
99                if (testName.equalsIgnoreCase("toUnicode")) {
100                    TestToUnicode(testcase, testToUnicode);
101                    testToUnicode++;
102
103                } else if (testName.equalsIgnoreCase("fromUnicode")) {
104                    TestFromUnicode(testcase, testFromUnicode);
105                    testFromUnicode++;
106                } else if (testName.equalsIgnoreCase("getUnicodeSet")) {
107                    TestGetUnicodeSet(testcase);
108                } else {
109                    warnln("Could not load the test cases for conversion");
110                    continue;
111                }
112            }
113        } catch (Exception e) {
114            e.printStackTrace();
115        }
116
117    }
118
119    // private methods -------------------------------------------------------
120
121
122    // fromUnicode test worker functions ---------------------------------------
123    private void TestFromUnicode(DataMap testcase, int caseNr) {
124
125        ConversionCase cc = new ConversionCase();
126
127       try {
128            // retrieve test case data
129            cc.caseNr = caseNr;
130            cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString();
131            cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString();
132            cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary();
133            cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector();
134            cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0;
135            cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0;
136            cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString();
137            cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString();
138
139        } catch (Exception e) {
140            errln("Skipping test:");
141            errln("error parsing conversion/toUnicode test case " + cc.caseNr);
142            return;
143        }
144
145        /*
146         * Skip the following data driven converter tests.
147         * These tests were added to the data driven conversion test in ICU
148         * to test direct-from-UTF-8 m:n Unicode:charset conversion.
149         * This feature is not in ICU4J.
150         * See #9601
151         */
152        // Android patch: Skip tests that fail with customized data.
153        String [] testsToSkip = {
154                "*test2",
155                "EUC-TW",
156                "gb18030",
157                "HZ",
158                "ibm-1386",
159                "ibm-1390",
160                "ibm-1390,swaplfnl",
161                "ibm-1399",
162                "ibm-16684",
163                "ibm-25546",
164                "ibm-930",
165                "ibm-943",
166                "ibm-970",
167                "ibm-971",
168                "IBM-eucJP",
169                "iso-2022-cn",
170                "ISO-2022-CN",
171                "iso-2022-jp",
172                "ISO-2022-JP",
173                "ISO-2022-JP-2",
174                "iso-2022-kr",
175                "ISO-2022-KR",
176                "JIS",
177                "JIS7",
178                "JIS8",
179                "lmbcs",
180                "windows-936",
181                "x11-compound-text"
182        };
183        // Android patch end.
184        for (int i = 0; i < testsToSkip.length; i++) {
185            if (cc.charset.equals(testsToSkip[i])) {
186                logln("");
187                logln("Skipping: " + cc.charset);
188                logln("...............................................");
189                return;
190            }
191        }
192
193        // ----for debugging only
194        logln("");
195        logln("TestFromUnicode[" + caseNr + "] " + cc.charset + " ");
196        logln("Unicode:   " + cc.unicode);
197        logln("Bytes:    " + printbytes(cc.bytes, cc.bytes.limit()));
198        ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes());
199        logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")");
200        logln("...............................................");
201
202        // process the retrieved test data case
203        if (cc.offsets.length == 0) {
204            cc.offsets = null;
205        } else if (cc.offsets.length != cc.bytes.limit()) {
206            errln("fromUnicode[" + cc.caseNr + "] bytes[" + cc.bytes
207                    + "] and offsets[" + cc.offsets.length
208                    + "] must have the same length");
209            return;
210        }
211
212        // check the callback replacement value
213        if (cc.cbopt.length() > 0) {
214
215            switch ((cc.cbopt).charAt(0)) {
216            case '?':
217                cc.cbErrorAction = CodingErrorAction.REPLACE;
218                break;
219            case '0':
220                cc.cbErrorAction = CodingErrorAction.IGNORE;
221                break;
222            case '.':
223                cc.cbErrorAction = CodingErrorAction.REPORT;
224                break;
225            case '&':
226                cc.cbErrorAction = CodingErrorAction.REPLACE;
227                cc.cbEncoder = CharsetCallback.FROM_U_CALLBACK_ESCAPE;
228                break;
229            default:
230                cc.cbErrorAction = null;
231                break;
232            }
233
234            // check for any options for the callback value --
235            cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt
236                    .substring(1);
237            if (cc.option == null) {
238                cc.option = null;
239            }
240        }
241        FromUnicodeCase(cc);
242    }
243
244
245    private void FromUnicodeCase(ConversionCase cc) {
246        // create charset encoder for conversion test
247        CharsetProviderICU provider = new CharsetProviderICU();
248        CharsetEncoder encoder = null;
249        Charset charset = null;
250        try {
251            // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
252            charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
253                    ? (Charset) provider.charsetForName(cc.charset.substring(1),
254                        "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
255                    : (Charset) provider.charsetForName(cc.charset);
256            if (charset != null) {
257                encoder = (CharsetEncoder) charset.newEncoder();
258                encoder.onMalformedInput(CodingErrorAction.REPLACE);
259                encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
260                if (encoder instanceof CharsetEncoderICU) {
261                    ((CharsetEncoderICU)encoder).setFallbackUsed(cc.fallbacks);
262                    if (((CharsetEncoderICU)encoder).isFallbackUsed() != cc.fallbacks) {
263                        errln("Fallback could not be set for " + cc.charset);
264                    }
265                }
266            }
267        } catch (Exception e) {
268            encoder = null;
269        }
270        if (encoder == null) {
271            if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) {
272                logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time");
273            } else {
274                errln(cc.charset + " was not found");
275            }
276            return;
277        }
278
279        // set the callback for the encoder
280        if (cc.cbErrorAction != null) {
281            if (cc.cbEncoder != null) {
282                ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option);
283                ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.unmappableForLength(1), cc.cbEncoder, cc.option);
284            } else {
285                encoder.onUnmappableCharacter(cc.cbErrorAction);
286                encoder.onMalformedInput(cc.cbErrorAction);
287            }
288
289            // if action has an option, put in the option for the case
290            if (cc.option.equals("i")) {
291                encoder.onMalformedInput(CodingErrorAction.REPORT);
292            }
293
294            // if callback action is replace,
295          //   and there is a subchar
296            // replace the decoder's default replacement value
297            // if substring, skip test due to current api not supporting
298            // substring
299            if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) {
300                if (cc.cbopt.length() > 1) {
301                    if (cc.cbopt.length() > 1 && cc.cbopt.charAt(1) == '=') {
302                        logln("Skipping test due to limitation in Java API - substitution string not supported");
303                        return;
304                    } else {
305                        // // read NUL-separated subchar first, if any
306                        // copy the subchar from Latin-1 characters
307                        // start after the NUL
308                        if (cc.cbopt.charAt(1) == 0x00) {
309                            cc.cbopt = cc.cbopt.substring(2);
310
311                            try {
312                                encoder.replaceWith(toByteArray(cc.cbopt));
313                            } catch (Exception e) {
314                                logln("Skipping test due to limitation in Java API - substitution character sequence size error");
315                                return;
316                            }
317                        }
318                    }
319                }
320            }
321        }
322
323        // do charset encoding from unicode
324
325        // testing by steps using charset.encoder(in,out,flush)
326        int resultLength;
327        boolean ok;
328        String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
329                { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
330        int i, step;
331
332        ok = true;
333
334        for (i = 0; i < steps.length && ok; ++i) {
335            step = Integer.parseInt(steps[i][0]);
336
337            logln("Testing step:[" + step + "]");
338            try {
339                resultLength = stepFromUnicode(cc, encoder, step);
340                ok = checkFromUnicode(cc, resultLength);
341            } catch (Exception ex) {
342                errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]");
343                ex.printStackTrace(System.out);
344                return;
345            }
346
347        }
348        // testing by whole buffer using out = charset.encoder(in)
349        while (ok && cc.finalFlush) {
350            logln("Testing java API charset.encoder(in):");
351            cc.fromUnicodeResult = null;
352            ByteBuffer out = null;
353
354            try {
355                out = encoder.encode(CharBuffer.wrap(cc.unicode.toCharArray()));
356                out.position(out.limit());
357                if (out.limit() != out.capacity() || cc.finalFlush) {
358                    int pos = out.position();
359                    byte[] temp = out.array();
360                    out = ByteBuffer.allocate(temp.length * 4);
361                    out.put(temp);
362                    out.position(pos);
363                    CoderResult cr = encoder.flush(out);
364                    if (cr.isOverflow()) {
365                        logln("Overflow error with flushing encoder");
366                    }
367                }
368                cc.fromUnicodeResult = out;
369
370                ok = checkFromUnicode(cc, out.limit());
371                if (!ok) {
372                    break;
373                }
374            } catch (Exception e) {
375                //check the error code to see if it matches cc.errorCode
376                logln("Encoder returned an error code");
377                logln("ErrorCode expected is: " + cc.outErrorCode);
378                logln("Error Result is: " + e.toString());
379            }
380            break;
381        }
382    }
383
384    private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) {
385        if (step < 0) {
386            errln("Negative step size, test internal error.");
387            return 0;
388        }
389
390        int sourceLen = cc.unicode.length();
391        int targetLen = cc.bytes.capacity() + 20;  // for BOM, and to let failures produce excess output
392        CharBuffer source = CharBuffer.wrap(cc.unicode.toCharArray());
393        ByteBuffer target = ByteBuffer.allocate(targetLen);
394        cc.fromUnicodeResult = null;
395        encoder.reset();
396
397        int currentSourceLimit;
398        int currentTargetLimit;
399        if (step > 0) {
400            currentSourceLimit = Math.min(step, sourceLen);
401            currentTargetLimit = Math.min(step, targetLen);
402        } else {
403            currentSourceLimit = sourceLen;
404            currentTargetLimit = targetLen;
405        }
406
407        CoderResult cr = null;
408
409        for (;;) {
410            source.limit(currentSourceLimit);
411            target.limit(currentTargetLimit);
412
413            cr = encoder.encode(source, target, currentSourceLimit == sourceLen);
414
415            if (cr.isUnderflow()) {
416                if (currentSourceLimit == sourceLen) {
417                    if (target.position() == cc.bytes.limit()) {
418                        // target contains the correct number of bytes
419                        break;
420                    }
421                    // Do a final flush for cleanup, then break out
422                    // Encode loop, exits with cr==underflow in normal operation.
423                    //target.limit(targetLen);
424                    target.limit(targetLen);
425                    cr = encoder.flush(target);
426                    if (cr.isUnderflow()) {
427                        // good
428                    } else if (cr.isOverflow()) {
429                        errln(cc.caseNrAsString() + " Flush is producing excessive output");
430                    } else {
431                        errln(cc.caseNrAsString() + " Flush operation failed.  CoderResult = \""
432                                + cr.toString() + "\"");
433                    }
434                    break;
435                }
436                currentSourceLimit = Math.min(currentSourceLimit + step, sourceLen);
437            } else if (cr.isOverflow()) {
438                if (currentTargetLimit == targetLen) {
439                    errln(cc.caseNrAsString() + " encode() is producing excessive output");
440                    break;
441                }
442                currentTargetLimit = Math.min(currentTargetLimit + step, targetLen);
443            } else {
444                // check the error code to see if it matches cc.errorCode
445                logln("Encoder returned an error code");
446                logln("ErrorCode expected is: " + cc.outErrorCode);
447                logln("Error Result is: " + cr.toString());
448                break;
449            }
450
451        }
452
453        cc.fromUnicodeResult = target;
454        return target.position();
455    }
456
457    private boolean checkFromUnicode(ConversionCase cc, int resultLength) {
458        return checkResultsFromUnicode(cc, cc.bytes, cc.fromUnicodeResult);
459    }
460
461    // toUnicode test worker functions ----------------------------------------- ***
462
463    private void TestToUnicode(DataMap testcase, int caseNr) {
464        // create Conversion case to store the test case data
465        ConversionCase cc = new ConversionCase();
466
467        try {
468            // retrieve test case data
469            cc.caseNr = caseNr;
470            cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString();
471            cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary();
472            cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString();
473            cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector();
474            cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0;
475            cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0;
476            cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString();
477            cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString();
478
479        } catch (Exception e) {
480            errln("Skipping test: error parsing conversion/toUnicode test case " + cc.caseNr);
481            return;
482        }
483
484        // Android patch: Skip tests that fail with customized data.
485        String [] testsToSkip = {
486                "HZ",
487                "ibm-1390",
488                "ibm-1390,swaplfnl",
489                "ibm-16684",
490                "ibm-25546",
491                "ibm-971",
492                "ISO-2022-CN",
493                "ISO-2022-JP",
494                "ISO-2022-JP-2",
495                "ISO-2022-KR",
496                "JIS7"
497        };
498        for (int i = 0; i < testsToSkip.length; i++) {
499            if (cc.charset.equals(testsToSkip[i])) {
500                logln("");
501                logln("Skipping: " + cc.charset);
502                logln("...............................................");
503                return;
504            }
505        }
506        // Android patch end.
507
508        // ----for debugging only
509        logln("");
510        logln("TestToUnicode[" + caseNr + "] " + cc.charset + " ");
511        logln("Unicode:   " + hex(cc.unicode));
512        logln("Bytes:    " + printbytes(cc.bytes, cc.bytes.limit()));
513        ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes());
514        logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")");
515        logln("...............................................");
516
517        // process the retrieved test data case
518        if (cc.offsets.length == 0) {
519            cc.offsets = null;
520        } else if (cc.offsets.length != cc.unicode.length()) {
521            errln("Skipping test: toUnicode[" + cc.caseNr + "] unicode["
522                    + cc.unicode.length() + "] and offsets["
523                    + cc.offsets.length + "] must have the same length");
524            return;
525        }
526        // check for the callback replacement value for unmappable
527        // characters or malformed errors
528        if (cc.cbopt.length() > 0) {
529            switch ((cc.cbopt).charAt(0)) {
530            case '?': // CALLBACK_SUBSTITUTE
531                cc.cbErrorAction = CodingErrorAction.REPLACE;
532                break;
533            case '0': // CALLBACK_SKIP
534                cc.cbErrorAction = CodingErrorAction.IGNORE;
535                break;
536            case '.': // CALLBACK_STOP
537                cc.cbErrorAction = CodingErrorAction.REPORT;
538                break;
539            case '&': // CALLBACK_ESCAPE
540                cc.cbErrorAction = CodingErrorAction.REPORT;
541                cc.cbDecoder = CharsetCallback.TO_U_CALLBACK_ESCAPE;
542                break;
543            default:
544                cc.cbErrorAction = null;
545                break;
546            }
547        }
548        // check for any options for the callback value
549        cc.option = cc.cbErrorAction == null ? null : cc.cbopt.substring(1);
550        if (cc.option == null) {
551            cc.option = null;
552        }
553
554        ToUnicodeCase(cc);
555
556    }
557
558    private void ToUnicodeCase(ConversionCase cc) {
559
560        // create converter for charset and decoder for each test case
561        CharsetProviderICU provider = new CharsetProviderICU();
562        CharsetDecoder decoder = null;
563        Charset charset = null;
564
565        try {
566            // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
567            charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
568                    ? (Charset) provider.charsetForName(cc.charset.substring(1),
569                        "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
570                    : (Charset) provider.charsetForName(cc.charset);
571            if (charset != null) {
572                decoder = (CharsetDecoder) charset.newDecoder();
573                decoder.onMalformedInput(CodingErrorAction.REPLACE);
574                decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
575            }
576        } catch (Exception e) {
577            // TODO implement loading of test data.
578            decoder = null;
579        }
580        if (decoder == null) {
581            if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) {
582                logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time");
583            } else {
584                errln(cc.charset + " was not found");
585            }
586            return;
587        }
588
589        // set the callback for the decoder
590        if (cc.cbErrorAction != null) {
591            if (cc.cbDecoder != null) {
592                ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.malformedForLength(1), cc.cbDecoder, cc.option);
593                ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.unmappableForLength(1), cc.cbDecoder, cc.option);
594            } else {
595                decoder.onMalformedInput(cc.cbErrorAction);
596                decoder.onUnmappableCharacter(cc.cbErrorAction);
597            }
598
599            // set the options (if any: SKIP_STOP_ON_ILLEGAL) for callback
600            if (cc.option.equals("i")) {
601                decoder.onMalformedInput(CodingErrorAction.REPORT);
602            }
603
604            // if callback action is replace, and there is a subchar
605            // replace the decoder's default replacement value
606            // if substring, skip test due to current api not supporting
607            // substring replacement
608            if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) {
609                if (cc.cbopt.length() > 1) {
610                    if (cc.cbopt.charAt(1) == '=') {
611                        logln("Skipping test due to limitation in Java API - substitution string not supported");
612
613                    } else {
614                        // // read NUL-separated subchar first, if any
615                        // copy the subchar from Latin-1 characters
616                        // start after the NUL
617                        if (cc.cbopt.charAt(1) == 0x00) {
618                            cc.cbopt = cc.cbopt.substring(2);
619
620                            try {
621                                decoder.replaceWith(cc.cbopt);
622                            } catch (Exception e) {
623                                logln("Skipping test due to limitation in Java API - substitution character sequence size error");
624                            }
625                        }
626                    }
627                }
628            }
629        }
630
631        //      Check the step to unicode
632        boolean ok;
633        int resultLength;
634
635        String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
636                { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
637        /* TODO: currently not supported test steps, getNext API is not supported for now
638         { "-1", "getNext" },
639         { "-2", "toU(bulk)+getNext" },
640         { "-3", "getNext+toU(bulk)" },
641         { "-4", "toU(1)+getNext" },
642         { "-5", "getNext+toU(1)" },
643         { "-12", "toU(5)+getNext" },
644         { "-13", "getNext+toU(5)" }};*/
645
646        ok = true;
647        int step;
648        // testing by steps using the CoderResult cr = charset.decoder(in,out,flush) api
649        for (int i = 0; i < steps.length && ok; ++i) {
650            step = Integer.parseInt(steps[i][0]);
651
652            if (step < 0 && !cc.finalFlush) {
653                continue;
654            }
655            logln("Testing step:[" + step + "]");
656
657            try {
658                resultLength = stepToUnicode(cc, decoder, step);
659                ok = checkToUnicode(cc, resultLength);
660            } catch (Exception ex) {
661                errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]");
662                ex.printStackTrace(System.out);
663                return;
664            }
665        }
666
667        //testing the java's out = charset.decoder(in) api
668        while (ok && cc.finalFlush) {
669            logln("Testing java charset.decoder(in):");
670            cc.toUnicodeResult = null;
671            CharBuffer out = null;
672
673            try {
674                cc.bytes.rewind();
675                out = decoder.decode(cc.bytes);
676                out.position(out.limit());
677                if (out.limit() < cc.unicode.length()) {
678                    int pos = out.position();
679                    char[] temp = out.array();
680                    out = CharBuffer.allocate(cc.bytes.limit());
681                    out.put(temp);
682                    out.position(pos);
683                    CoderResult cr = decoder.flush(out);
684                    if (cr.isOverflow()) {
685                        logln("Overflow error with flushing decodering");
686                    }
687                }
688
689                cc.toUnicodeResult = out;
690
691                ok = checkToUnicode(cc, out.limit());
692                if (!ok) {
693                    break;
694                }
695            } catch (Exception e) {
696                //check the error code to see if it matches cc.errorCode
697                logln("Decoder returned an error code");
698                logln("ErrorCode expected is: " + cc.outErrorCode);
699                logln("Error Result is: " + e.toString());
700            }
701            break;
702        }
703
704        return;
705    }
706
707
708
709
710    private int stepToUnicode(ConversionCase cc, CharsetDecoder decoder,
711            int step)
712
713    {
714        ByteBuffer source;
715        CharBuffer target;
716        boolean flush = false;
717        int sourceLen;
718        source = cc.bytes;
719        sourceLen = cc.bytes.limit();
720        source.position(0);
721        target = CharBuffer.allocate(cc.unicode.length() + 4);
722        target.position(0);
723        cc.toUnicodeResult = null;
724        decoder.reset();
725
726        if (step >= 0) {
727
728            int iStep = step;
729            int oStep = step;
730
731            for (;;) {
732
733                if (step != 0) {
734                    source.limit((iStep <= sourceLen) ? iStep : sourceLen);
735                    target.limit((oStep <= target.capacity()) ? oStep : target
736                            .capacity());
737                    flush = (cc.finalFlush && source.limit() == sourceLen);
738
739                } else {
740                    //bulk mode
741                    source.limit(sourceLen);
742                    target.limit(target.capacity());
743                    flush = cc.finalFlush;
744                }
745                // convert
746                CoderResult cr = null;
747                if (source.hasRemaining()) {
748
749                    cr = decoder.decode(source, target, flush);
750                    // check pointers and errors
751                    if (cr.isOverflow()) {
752                        // the partial target is filled, set a new limit,
753                        oStep = (target.position() + step);
754                        target.limit((oStep < target.capacity()) ? oStep
755                                : target.capacity());
756                        if (target.limit() > target.capacity()) {
757                            //target has reached its limit, an error occurred or test case has an error code
758                            //check error code
759                            logln("UnExpected error: Target Buffer is larger than capacity");
760                            break;
761                        }
762
763                    } else if (cr.isError()) {
764                        //check the error code to see if it matches cc.errorCode
765                        logln("Decoder returned an error code");
766                        logln("ErrorCode expected is: " + cc.outErrorCode);
767                        logln("Error Result is: " + cr.toString());
768                        break;
769                    }
770
771                } else {
772                    if (source.limit() == sourceLen) {
773
774                        cr = decoder.decode(source, target, true);
775
776                        //due to limitation of the API we need to check for target limit for expected
777                        if (target.position() != cc.unicode.length()) {
778                            if (target.limit() != cc.unicode.length()) {
779                                target.limit(cc.unicode.length());
780                            }
781                            cr = decoder.flush(target);
782                            if (cr.isError()) {
783                                errln("Flush operation failed");
784                            }
785                        }
786                        break;
787                    }
788                }
789                iStep += step;
790
791            }
792
793        }// if(step ==0)
794
795        //--------------------------------------------------------------------------
796        else /* step<0 */{
797            /*
798             * step==-1: call only ucnv_getNextUChar()
799             * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
800             *   if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
801             *   else give it at most (-step-2)/2 bytes
802             */
803
804            for (;;) {
805                // convert
806                if ((step & 1) != 0 /* odd: -1, -3, -5, ... */) {
807
808                    target.limit(target.position() < target.capacity() ? target
809                            .position() + 1 : target.capacity());
810
811                    // decode behavior is return to output target 1 character
812                    CoderResult cr = null;
813
814                    //similar to getNextUChar() , input is the whole string, while outputs only 1 character
815                    source.limit(sourceLen);
816                    while (target.position() != target.limit()
817                            && source.hasRemaining()) {
818                        cr = decoder.decode(source, target,
819                                source.limit() == sourceLen);
820
821                        if (cr.isOverflow()) {
822
823                            if (target.limit() >= target.capacity()) {
824                                // target has reached its limit, an error occurred
825                                logln("UnExpected error: Target Buffer is larger than capacity");
826                                break;
827                            } else {
828                                //1 character has been consumed
829                                target.limit(target.position() + 1);
830                                break;
831                            }
832                        } else if (cr.isError()) {
833                            logln("Decoder returned an error code");
834                            logln("ErrorCode expected is: " + cc.outErrorCode);
835                            logln("Error Result is: " + cr.toString());
836
837                            cc.toUnicodeResult = target;
838                            return target.position();
839                        }
840
841                        else {
842                            // one character has been consumed
843                            if (target.limit() == target.position()) {
844                                target.limit(target.position() + 1);
845                                break;
846                            }
847                        }
848
849                    }
850                    if (source.position() == sourceLen) {
851
852                        // due to limitation of the API we need to check
853                        // for target limit for expected
854                        cr = decoder.decode(source, target, true);
855                        if (target.position() != cc.unicode.length()) {
856
857                            target.limit(cc.unicode.length());
858                            cr = decoder.flush(target);
859                            if (cr.isError()) {
860                                errln("Flush operation failed");
861                            }
862                        }
863                        break;
864                    }
865                    // alternate between -n-1 and -n but leave -1 alone
866                    if (step < -1) {
867                        ++step;
868                    }
869                } else {/* step is even */
870                    // allow only one UChar output
871
872                    target.limit(target.position() < target.capacity() ? target
873                            .position() + 1 : target.capacity());
874                    if (step == -2) {
875                        source.limit(sourceLen);
876                    } else {
877                        source.limit(source.position() + (-step - 2) / 2);
878                        if (source.limit() > sourceLen) {
879                            source.limit(sourceLen);
880                        }
881                    }
882                    CoderResult cr = decoder.decode(source, target, source
883                            .limit() == sourceLen);
884                    // check pointers and errors
885                    if (cr.isOverflow()) {
886                        // one character has been consumed
887                        if (target.limit() >= target.capacity()) {
888                            // target has reached its limit, an error occurred
889                            logln("Unexpected error: Target Buffer is larger than capacity");
890                            break;
891                        }
892                    } else if (cr.isError()) {
893                        logln("Decoder returned an error code");
894                        logln("ErrorCode expected is: " + cc.outErrorCode);
895                        logln("Error Result is: " + cr.toString());
896                        break;
897                    }
898
899                    --step;
900                }
901            }
902        }
903
904        //--------------------------------------------------------------------------
905
906        cc.toUnicodeResult = target;
907        return target.position();
908    }
909
910
911
912    private boolean checkToUnicode(ConversionCase cc, int resultLength) {
913        return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult);
914    }
915
916
917    private void TestGetUnicodeSet(DataMap testcase) {
918        /*
919         * charset - will be opened, and ucnv_getUnicodeSet() called on it //
920         * map - set of code points and strings that must be in the returned set //
921         * mapnot - set of code points and strings that must *not* be in the //
922         * returned set // which - numeric UConverterUnicodeSet value Headers {
923         * "charset", "map", "mapnot", "which" }
924         */
925
926
927        // retrieve test case data
928        ConversionCase cc = new ConversionCase();
929        CharsetProviderICU provider = new CharsetProviderICU();
930        CharsetICU charset  ;
931
932
933        UnicodeSet mapset = new UnicodeSet();
934        UnicodeSet mapnotset = new UnicodeSet();
935        UnicodeSet unicodeset = new UnicodeSet();
936        String ellipsis = "0x2e";
937        cc.charset = ((ICUResourceBundle) testcase.getObject("charset"))
938                .getString();
939        cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString();
940        cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot"))
941                .getString();
942
943
944        cc.which = ((ICUResourceBundle) testcase.getObject("which")).getInt(); // only checking for ROUNDTRIP_SET
945
946        // Android patch: Skip tests that fail with customized data.
947        String [] testsToSkip = {
948                "HZ",
949                "ibm-1390",
950                "ibm-16684",
951                "ibm-25546",
952                "ibm-971",
953                "ISO-2022-CN",
954                "ISO-2022-JP",
955                "ISO-2022-JP-2",
956                "ISO-2022-KR",
957                "JIS7",
958        };
959        for (int i = 0; i < testsToSkip.length; i++) {
960            if (cc.charset.equals(testsToSkip[i])) {
961                logln("");
962                logln("Skipping: " + cc.charset);
963                logln("...............................................");
964                return;
965            }
966        }
967        // Android patch end.
968
969        // ----for debugging only
970        logln("");
971        logln("TestGetUnicodeSet[" + cc.charset + "] ");
972        logln("...............................................");
973
974        try{
975           // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
976           charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
977                    ? (CharsetICU) provider.charsetForName(cc.charset.substring(1),
978                        "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
979                    : (CharsetICU) provider.charsetForName(cc.charset);
980
981           //checking for converter that are not supported at this point
982           try{
983                if(charset==null ||
984                        charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2" ||
985                      charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" ||
986                      charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" ||
987                      charset.name()=="lmbcs18"|| charset.name()=="lmbcs19"){
988                    logln("Converter not supported at this point :" + cc.charset);
989                   return;
990               }
991
992               if(cc.which==1){
993                   logln("Fallback set not supported at this point for converter : "+charset.displayName());
994                  return;
995               }
996
997           }catch(Exception e){
998               return;
999           }
1000
1001           mapset.clear();
1002           mapnotset.clear();
1003
1004           mapset.applyPattern(cc.map,false);
1005           mapnotset.applyPattern(cc.mapnot,false);
1006
1007           charset.getUnicodeSet(unicodeset, cc.which);
1008           UnicodeSet diffset = new UnicodeSet();
1009
1010           //are there items that must be in unicodeset but are not?
1011           (diffset = mapset).removeAll(unicodeset);
1012           if(!diffset.isEmpty()){
1013               StringBuffer s = new StringBuffer(diffset.toPattern(true));
1014               if(s.length()>100){
1015                   s.replace(0, 0x7fffffff, ellipsis);
1016               }
1017               errln("error in missing items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString());
1018           }
1019
1020          //are the items that must not be in unicodeset but are?
1021           (diffset=mapnotset).retainAll(unicodeset);
1022           if(!diffset.isEmpty()){
1023               StringBuffer s = new StringBuffer(diffset.toPattern(true));
1024               if(s.length()>100){
1025                   s.replace(0, 0x7fffffff, ellipsis);
1026               }
1027               errln("contains unexpected items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString());
1028           }
1029         } catch (Exception e) {
1030             errln("getUnicodeSet returned an error code");
1031             errln("ErrorCode expected is: " + cc.outErrorCode);
1032             errln("Error Result is: " + e.toString());
1033             return;
1034         }
1035    }
1036
1037    /**
1038     * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
1039     * start of the stream for example U+FEFF (the Unicode BOM/signature
1040     * character) that can be ignored.
1041     *
1042     * Detects Unicode signature byte sequences at the start of the byte stream
1043     * and returns number of bytes of the BOM of the indicated Unicode charset.
1044     * 0 is returned when no Unicode signature is recognized.
1045     *
1046     */
1047
1048    private String detectUnicodeSignature(ByteBuffer source) {
1049        int signatureLength = 0; // number of bytes of the signature
1050        final int SIG_MAX_LEN = 5;
1051        String sigUniCharset = null; // states what unicode charset is the BOM
1052        int i = 0;
1053
1054        /*
1055         * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
1056         * don't misdetect something
1057         */
1058        byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
1059                (byte) 0xa5 };
1060
1061        while (i < source.limit() && i < SIG_MAX_LEN) {
1062            start[i] = source.get(i);
1063            i++;
1064        }
1065
1066        if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
1067            signatureLength = 2;
1068            sigUniCharset = "UTF-16BE";
1069            source.position(signatureLength);
1070            return sigUniCharset;
1071        } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
1072            if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
1073                signatureLength = 4;
1074                sigUniCharset = "UTF-32LE";
1075                source.position(signatureLength);
1076                return sigUniCharset;
1077            } else {
1078                signatureLength = 2;
1079                sigUniCharset = "UTF-16LE";
1080                source.position(signatureLength);
1081                return sigUniCharset;
1082            }
1083        } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
1084                && start[2] == (byte) 0xBF) {
1085            signatureLength = 3;
1086            sigUniCharset = "UTF-8";
1087            source.position(signatureLength);
1088            return sigUniCharset;
1089        } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
1090                && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
1091            signatureLength = 4;
1092            sigUniCharset = "UTF-32BE";
1093            source.position(signatureLength);
1094            return sigUniCharset;
1095        } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
1096                && start[2] == (byte) 0xFF) {
1097            signatureLength = 3;
1098            sigUniCharset = "SCSU";
1099            source.position(signatureLength);
1100            return sigUniCharset;
1101        } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
1102                && start[2] == (byte) 0x28) {
1103            signatureLength = 3;
1104            sigUniCharset = "BOCU-1";
1105            source.position(signatureLength);
1106            return sigUniCharset;
1107        } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
1108                && start[2] == (byte) 0x76) {
1109
1110            if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
1111                signatureLength = 5;
1112                sigUniCharset = "UTF-7";
1113                source.position(signatureLength);
1114                return sigUniCharset;
1115            } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
1116                    || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
1117                signatureLength = 4;
1118                sigUniCharset = "UTF-7";
1119                source.position(signatureLength);
1120                return sigUniCharset;
1121            }
1122        } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73
1123                && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
1124            signatureLength = 4;
1125            sigUniCharset = "UTF-EBCDIC";
1126            source.position(signatureLength);
1127            return sigUniCharset;
1128        }
1129
1130        /* no known Unicode signature byte sequence recognized */
1131        return null;
1132    }
1133
1134    String printbytes(ByteBuffer buf, int pos) {
1135        int cur = buf.position();
1136        String res = " (" + pos + ")==[";
1137        for (int i = 0; i < pos; i++) {
1138            res += "(" + i + ")" + hex(buf.get(i) & 0xff).substring(2) + " ";
1139        }
1140        buf.position(cur);
1141        return res + "]";
1142    }
1143
1144    String printchars(CharBuffer buf, int pos) {
1145        int cur = buf.position();
1146        String res = " (" + pos + ")==[";
1147        for (int i = 0; i < pos; i++) {
1148            res += "(" + i + ")" + hex(buf.get(i)) + " ";
1149        }
1150        buf.position(cur);
1151        return res + "]";
1152    }
1153
1154    private boolean checkResultsFromUnicode(ConversionCase cc, ByteBuffer expected,
1155            ByteBuffer output) {
1156
1157        boolean res = true;
1158        expected.rewind();
1159        output.limit(output.position());
1160        output.rewind();
1161
1162        // remove any BOM signature before checking
1163        if (!cc.charset.contains("UnicodeLittle") && !cc.charset.contains("UnicodeBig")) {
1164            detectUnicodeSignature(output); // sets the position to after the BOM
1165            output = output.slice(); // removes anything before the current position
1166        }
1167
1168        if (output.limit() != expected.limit()) {
1169            errln("Test failed: output length does not match expected for charset: " + cc.charset
1170                    + " [" + cc.caseNr + "]");
1171            res = false;
1172        } else {
1173            while (output.hasRemaining()) {
1174                if (output.get() != expected.get()) {
1175                    errln("Test failed: output does not match expected for charset: " + cc.charset
1176                            + " [" + cc.caseNr + "]");
1177                    res = false;
1178                    break;
1179                }
1180            }
1181        }
1182
1183        if (res) {
1184            logln("[" + cc.caseNr + "]:" + cc.charset);
1185            logln("Input:       " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length()));
1186            logln("Output:      " + printbytes(output, output.limit()));
1187            logln("Expected:    " + printbytes(expected, expected.limit()));
1188            logln("Passed");
1189        }
1190        else {
1191            errln("[" + cc.caseNr + "]:" + cc.charset);
1192            errln("Input:       " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length()));
1193            errln("Output:      " + printbytes(output, output.limit()));
1194            errln("Expected:    " + printbytes(expected, expected.limit()));
1195            errln("Failed");
1196        }
1197        return res;
1198    }
1199
1200    private boolean checkResultsToUnicode(ConversionCase cc, String expected, CharBuffer output) {
1201
1202        boolean res = true;
1203        output.limit(output.position());
1204        output.rewind();
1205
1206        // test to see if the conversion matches actual results
1207        if (output.limit() != expected.length()) {
1208            errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]");
1209            res = false;
1210        } else {
1211            for (int i = 0; i < expected.length(); i++) {
1212                if (output.get(i) != expected.charAt(i)) {
1213                    errln("Test failed: output does not match expected for charset: " + cc.charset
1214                            + " [" + cc.caseNr + "]");
1215                    res = false;
1216                    break;
1217                }
1218            }
1219        }
1220
1221        if (res) {
1222            logln("[" + cc.caseNr + "]:" + cc.charset);
1223            logln("Input:       " + printbytes(cc.bytes, cc.bytes.limit()));
1224            logln("Output:      " + printchars(output, output.limit()));
1225            logln("Expected:    " + printchars(CharBuffer.wrap(expected), expected.length()));
1226            logln("Passed");
1227        } else {
1228            errln("[" + cc.caseNr + "]:" + cc.charset);
1229            errln("Input:       " + printbytes(cc.bytes, cc.bytes.limit()));
1230            errln("Output:      " + printchars(output, output.limit()));
1231            errln("Expected:    " + printchars(CharBuffer.wrap(expected), expected.length()));
1232            errln("Failed");
1233        }
1234        return res;
1235    }
1236
1237    private byte[] toByteArray(String str) {
1238        byte[] ret = new byte[str.length()];
1239        for (int i = 0; i < ret.length; i++) {
1240            char ch = str.charAt(i);
1241            if (ch <= 0xFF) {
1242                ret[i] = (byte) ch;
1243            } else {
1244                throw new IllegalArgumentException(" byte value out of range: " + ch);
1245            }
1246        }
1247        return ret;
1248    }
1249}
1250