/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
package android.icu.text;

122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller/**
132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                           This is a superclass for the individual detectors for
152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                           each of the detectable members of the ISO 2022 family
162ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                           of encodings.
17f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert *
182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller *                           The separate classes are nested within this class.
192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller */
202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fullerabstract class CharsetRecog_2022 extends CharsetRecognizer {
212ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
22f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    /**
242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * Matching function shared among the 2022 detectors JP, CN and KR
252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * Counts up the number of legal an unrecognized escape sequences in
262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * the sample of text, and computes a score based on the total number &
272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * the proportion that fit the encoding.
28f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert     *
29f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert     *
302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * @param text the byte buffer containing text to analyse
312ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * @param textLen  the size of the text in the byte.
322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * @param escapeSequences the byte escape sequences to test for.
332ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     * @return match quality, in the range of 0-100.
342ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller     */
352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    int   match(byte [] text, int textLen, byte [][] escapeSequences) {
362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int     i, j;
372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int     escN;
382ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int     hits   = 0;
392ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int     misses = 0;
402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int     shifts = 0;
412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        int     quality;
422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        scanInput:
432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            for (i=0; i<textLen; i++) {
442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                if (text[i] == 0x1b) {
452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    checkEscapes:
462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                        for (escN=0; escN<escapeSequences.length; escN++) {
472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            byte [] seq = escapeSequences[escN];
48f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            if ((textLen - i) < seq.length) {
502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                                continue checkEscapes;
512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            }
52f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            for (j=1; j<seq.length; j++) {
542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                                if (seq[j] != text[i+j])  {
552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                                    continue checkEscapes;
56f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                                }
572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            }
58f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
59f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                            hits++;
602ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            i += seq.length-1;
612ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                            continue scanInput;
622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                        }
63f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
64f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                        misses++;
652ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                }
66f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                if (text[i] == 0x0e || text[i] == 0x0f) {
682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    // Shift in/out
692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                    shifts++;
702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                }
712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            }
72f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        if (hits == 0) {
742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            return 0;
752ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
76f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
772ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        //
782ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        // Initial quality is based on relative proportion of recongized vs.
79f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        //   unrecognized escape sequences.
802ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        //   All good:  quality = 100;
812ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        //   half or less good: quality = 0;
822ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        //   linear inbetween.
832ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        quality = (100*hits - 100*misses) / (hits + misses);
84f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
852ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        // Back off quality if there were too few escape sequences seen.
862ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        //   Include shifts in this computation, so that KR does not get penalized
872ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        //   for having only a single Escape sequence, but many shifts.
882ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        if (hits+shifts < 5) {
892ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            quality -= (5-(hits+shifts))*10;
902ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
91f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
922ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        if (quality < 0) {
932ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            quality = 0;
94f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        }
952ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        return quality;
962ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    }
972ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
98f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
99f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
100f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1012ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    static class CharsetRecog_2022JP extends CharsetRecog_2022 {
1022ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        private byte [] [] escapeSequences = {
1032ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
1042ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
1052ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x40},         // JIS C 6226-1978
1062ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x41},         // GB 2312-80
1072ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x42},         // JIS X 208-1983
1082ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
1092ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x28, 0x42},         // ASCII
1102ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x28, 0x48},         // JIS-Roman
1112ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x28, 0x49},         // Half-width katakana
1122ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x28, 0x4a},         // JIS-Roman
1132ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x2e, 0x41},         // ISO 8859-1
1142ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x2e, 0x46}          // ISO 8859-7
1152ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                };
116f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
117f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        @Override
1182ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        String getName() {
1192ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            return "ISO-2022-JP";
1202ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
121f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
122f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        @Override
1232ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        CharsetMatch   match(CharsetDetector det) {
1242ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
1252ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1262ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
1272ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    }
1282ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
1292ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
1302ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        private byte [] [] escapeSequences = {
131f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert                {0x1b, 0x24, 0x29, 0x43}
1322ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                 };
133f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
134f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        @Override
1352ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        String getName() {
1362ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            return "ISO-2022-KR";
1372ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
138f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
139f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        @Override
1402ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        CharsetMatch   match(CharsetDetector det) {
1412ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            int confidence =  match(det.fInputBytes, det.fInputLen, escapeSequences);
1422ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1432ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
1442ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    }
1452ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
1462ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    static class CharsetRecog_2022CN extends CharsetRecog_2022 {
1472ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        private byte [] [] escapeSequences = {
1482ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
1492ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
1502ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
1512ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
1522ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
1532ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
1542ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
1552ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
1562ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
1572ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x4e},               // SS2
1582ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller                {0x1b, 0x4f},               // SS3
1592ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        };
160f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
161f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        @Override
1622ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        String getName() {
1632ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            return "ISO-2022-CN";
1642ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
165f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
166f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert        @Override
1672ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        CharsetMatch   match(CharsetDetector det) {
1682ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
1692ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
1702ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller        }
1712ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller    }
172f86f25d102340da66b9c7cb6b2d5ecdc0de43ecfFredrik Roubert
1732ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller}
1742ae130017183d2f66d55bf0ca51f8da3294644fdNeil Fuller
175