// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text;
107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *                           This is a superclass for the individual detectors for
147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *                           each of the detectable members of the ISO 2022 family
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *                           of encodings.
162d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert *
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *                           The separate classes are nested within this class.
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertabstract class CharsetRecog_2022 extends CharsetRecognizer {
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
212d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Matching function shared among the 2022 detectors JP, CN and KR
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Counts up the number of legal an unrecognized escape sequences in
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * the sample of text, and computes a score based on the total number &
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * the proportion that fit the encoding.
272d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert     *
282d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert     *
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param text the byte buffer containing text to analyse
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param textLen  the size of the text in the byte.
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @param escapeSequences the byte escape sequences to test for.
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return match quality, in the range of 0-100.
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    int   match(byte [] text, int textLen, byte [][] escapeSequences) {
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     i, j;
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     escN;
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     hits   = 0;
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     misses = 0;
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     shifts = 0;
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int     quality;
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        scanInput:
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for (i=0; i<textLen; i++) {
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (text[i] == 0x1b) {
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    checkEscapes:
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        for (escN=0; escN<escapeSequences.length; escN++) {
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            byte [] seq = escapeSequences[escN];
472d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            if ((textLen - i) < seq.length) {
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                continue checkEscapes;
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            }
512d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            for (j=1; j<seq.length; j++) {
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                if (seq[j] != text[i+j])  {
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                    continue checkEscapes;
552d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert                                }
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            }
572d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
582d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert                            hits++;
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            i += seq.length-1;
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            continue scanInput;
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        }
622d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
632d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert                        misses++;
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
652d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (text[i] == 0x0e || text[i] == 0x0f) {
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Shift in/out
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    shifts++;
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
712d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (hits == 0) {
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return 0;
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
752d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Initial quality is based on relative proportion of recongized vs.
782d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        //   unrecognized escape sequences.
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   All good:  quality = 100;
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   half or less good: quality = 0;
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   linear inbetween.
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        quality = (100*hits - 100*misses) / (hits + misses);
832d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Back off quality if there were too few escape sequences seen.
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   Include shifts in this computation, so that KR does not get penalized
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   for having only a single Escape sequence, but many shifts.
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (hits+shifts < 5) {
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            quality -= (5-(hits+shifts))*10;
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
902d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (quality < 0) {
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            quality = 0;
932d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        }
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return quality;
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
972d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
982d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
992d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static class CharsetRecog_2022JP extends CharsetRecog_2022 {
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private byte [] [] escapeSequences = {
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x40},         // JIS C 6226-1978
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x41},         // GB 2312-80
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x42},         // JIS X 208-1983
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x42},         // ASCII
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x48},         // JIS-Roman
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x49},         // Half-width katakana
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x28, 0x4a},         // JIS-Roman
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x2e, 0x41},         // ISO 8859-1
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x2e, 0x46}          // ISO 8859-7
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                };
1152d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1162d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        @Override
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String getName() {
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return "ISO-2022-JP";
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1202d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1212d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        @Override
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsetMatch   match(CharsetDetector det) {
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private byte [] [] escapeSequences = {
1302d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert                {0x1b, 0x24, 0x29, 0x43}
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                 };
1322d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1332d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        @Override
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String getName() {
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return "ISO-2022-KR";
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1372d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1382d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        @Override
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsetMatch   match(CharsetDetector det) {
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int confidence =  match(det.fInputBytes, det.fInputLen, escapeSequences);
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static class CharsetRecog_2022CN extends CharsetRecog_2022 {
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private byte [] [] escapeSequences = {
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x4e},               // SS2
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                {0x1b, 0x4f},               // SS3
1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        };
1592d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1602d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        @Override
1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String getName() {
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return "ISO-2022-CN";
1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1642d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1652d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        @Override
1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsetMatch   match(CharsetDetector det) {
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1712d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
174