17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/*
27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*******************************************************************************
37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* others. All Rights Reserved.                                                *
57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*******************************************************************************
67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/
77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text;
87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
/**
 *  class CharsetRecog_2022  part of the ICU charset detection implementation.
 *                           This is a superclass for the individual detectors for
 *                           each of the detectable members of the ISO 2022 family
 *                           of encodings.
 *
 *                           The separate classes are nested within this class.
 */
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertabstract class CharsetRecog_2022 extends CharsetRecognizer {
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Matching function shared among the 2022 detectors JP, CN and KR
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Counts up the number of legal an unrecognized escape sequences in
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * the sample of text, and computes a score based on the total number &
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * the proportion that fit the encoding.
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param text the byte buffer containing text to analyse
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param textLen  the size of the text in the byte.
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param escapeSequences the byte escape sequences to test for.
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return match quality, in the range of 0-100.
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    int   match(byte [] text, int textLen, byte [][] escapeSequences) {
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     i, j;
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     escN;
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     hits   = 0;
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     misses = 0;
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     shifts = 0;
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     quality;
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        scanInput:
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for (i=0; i<textLen; i++) {
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (text[i] == 0x1b) {
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    checkEscapes:
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        for (escN=0; escN<escapeSequences.length; escN++) {
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            byte [] seq = escapeSequences[escN];
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            if ((textLen - i) < seq.length) {
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                continue checkEscapes;
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            }
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            for (j=1; j<seq.length; j++) {
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                if (seq[j] != text[i+j])  {
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                    continue checkEscapes;
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                }
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            }
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            hits++;
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            i += seq.length-1;
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            continue scanInput;
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        }
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        misses++;
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (text[i] == 0x0e || text[i] == 0x0f) {
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Shift in/out
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    shifts++;
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (hits == 0) {
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return 0;
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Initial quality is based on relative proportion of recongized vs.
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   unrecognized escape sequences.
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   All good:  quality = 100;
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   half or less good: quality = 0;
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   linear inbetween.
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        quality = (100*hits - 100*misses) / (hits + misses);
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Back off quality if there were too few escape sequences seen.
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   Include shifts in this computation, so that KR does not get penalized
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   for having only a single Escape sequence, but many shifts.
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (hits+shifts < 5) {
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            quality -= (5-(hits+shifts))*10;
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (quality < 0) {
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            quality = 0;
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return quality;
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static class CharsetRecog_2022JP extends CharsetRecog_2022 {
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private byte [] [] escapeSequences = {
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x40},         // JIS C 6226-1978
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x41},         // GB 2312-80
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x42},         // JIS X 208-1983
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x42},         // ASCII
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x48},         // JIS-Roman
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x49},         // Half-width katakana
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x4a},         // JIS-Roman
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x2e, 0x41},         // ISO 8859-1
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x2e, 0x46}          // ISO 8859-7
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                };
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String getName() {
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return "ISO-2022-JP";
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsetMatch   match(CharsetDetector det) {
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private byte [] [] escapeSequences = {
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x29, 0x43}
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                 };
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String getName() {
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return "ISO-2022-KR";
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsetMatch   match(CharsetDetector det) {
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int confidence =  match(det.fInputBytes, det.fInputLen, escapeSequences);
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static class CharsetRecog_2022CN extends CharsetRecog_2022 {
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private byte [] [] escapeSequences = {
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x4e},               // SS2
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x4f},               // SS3
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        };
1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String getName() {
1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return "ISO-2022-CN";
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsetMatch   match(CharsetDetector det) {
1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
166