// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ***********************************************************************
 *
 * Copyright (C) 2005-2012, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 ***********************************************************************
 *
 * euc_tool
 *
 *    This tool produces the character usage frequency statistics for the EUC family
 *    of charsets, for use by the ICU charset detectors.
 *
 *    usage:  java euc_tool [-d] [directory path]
 *
 *        -d:   Produce the data in a form to be exported to the ICU implementation
 *              Default is to produce an informative dump.
 *
 *        directory path
 *              Source directory for the files to be analyzed.
 *              Default is the current directory.
 *              There should be three subdirectories under the specified directory, one
 *              each for EUC_JP, EUC_CN and EUC_KR.  Within each of these subdirectories
 *              should be text files in the specified encoding.
 *
 */
29bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
30bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertpackage com.ibm.icu.dev.tool.charsetdet.mbcs;
31bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
32bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.io.File;
33bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.io.FileInputStream;
34bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.ArrayList;
35bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.Arrays;
36bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.HashMap;
37bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.List;
38bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
39bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertpublic class EUCTool {
40bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
41bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    // The file buffer and file data length need to be out in class member variables
42bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //  so that the code lifted from charSet detection for scanning the multi-byte chars
43bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //  can see them conveniently.
44bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    byte []    buf = new byte[1000000];
45bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    int        fileSize;
46bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
47bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    boolean    option_d = false;    // data option.  Produce exportable data
48bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    boolean    option_v = true;     // verbose informaional output.
49bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
50bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
51bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
52bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    public static void main(String[] args) {
53bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        EUCTool  This = new EUCTool();
54bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        This.Main(args);
55bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    }
56bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
57bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
58bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
59bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    void Main(String[] args) {
60bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int i;
61bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
62bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
63bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //   Command Line Option Handling
64bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
65bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        String     dirName  = ".";
66bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        for (i=0; i<args.length; i++) {
67bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (args[i].equals("-d")) {
68bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                option_d = true;
69bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                option_v = false;
70bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                continue;
71bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
72bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (args[i].startsWith("-")) {
73bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                System.err.println("Unrecongized option: " + args[i]);
74bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                System.exit(-1);
75bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
76bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            dirName = args[i];
77bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
78bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
79bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
80bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //  Verify that the specified directory exists.
81bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
82bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        File dir = new File(dirName);
83bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        if (dir.isDirectory() == false) {
84bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            System.err.println("\"" + dirName + "\" is not a directory");
85bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            System.exit(-1);
86bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
87bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
88bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
89bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //  Do each subdirectory of the specified directory.  There should be
90bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //    one per each encoding - euc-kr, euc-cn, euc-jp
91bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
92bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        File[] dirs  = dir.listFiles();
93bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        for (i=0; i<dirs.length; i++) {
94bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (dirs[i].isDirectory()) {
95bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                String nam = dirs[i].getName();
96bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                if (nam.equalsIgnoreCase("CVS")) {
97bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    continue;
98bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                }
99bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                processDir(dirs[i]);
100bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
101bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
102bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    }
103bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
104bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //
105bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    // Collect statistics from all ordinary files in a specified directory.
106bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //
107bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    void processDir(File dir) {
108bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int      totalMbcsChars  = 0;
109bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        HashMap  m = new HashMap(10000);
110bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int      i;
111bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
112bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        System.out.println(dir.getName());
113bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        File[] files = dir.listFiles();
114bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        for (i=0; i<files.length; i++) {
115bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            FileInputStream is = null;
116bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            try {
117bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                if (files[i].isFile()) {
118bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    is = new FileInputStream(files[i]);
119bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    fileSize = is.read(buf);
120bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    if (option_v) {
121bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        System.out.println(files[i].getPath());
122bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        System.out.println("  " + fileSize + " bytes.");
123bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    }
124bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    iteratedChar ichar = new iteratedChar();
125bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    int fileChars     = 0;
126bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    int fileMbcsChars = 0;
127bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    int errs          = 0;
128bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
129bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    while (nextChar(ichar)) {
130bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        if (ichar.error == true) {
131bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            errs++;
132bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            continue;
133bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        }
134bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        fileChars++;
135bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        if (ichar.charValue > 255) {
136bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            fileMbcsChars++;
137bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            totalMbcsChars++;
138bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        }
139bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        if (ichar.charValue <= 255) {
140bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            // Don't keep occurence statistics for the single byte range
141bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            continue;
142bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        }
143bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
144bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        //
145bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        //  Frequency of occurence statistics are accumulated in a map.
146bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        //
147bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        ChEl  keyEl = new ChEl(ichar.charValue, 0);
148bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        ChEl  valEl = (ChEl)m.get(keyEl);
149bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        if (valEl == null) {
150bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            m.put(keyEl, keyEl);
151bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                            valEl = keyEl;
152bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        }
153bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        valEl.occurences++;
154bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    }
155bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    if (option_v) {
156bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        System.out.println("  " + fileChars     + " Chars");
157bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        System.out.println("  " + fileMbcsChars + " mbcs Chars");
158bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        System.out.println("  " + errs          + " errors");
159bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        System.out.println("\n");
160bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    }
161bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                }
162bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
163bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            catch (Exception e) {
164bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                System.err.println("Exception:" + e);
165bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
166bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
167bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            finally {
168bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                if (is != null) {
169bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    try {
170bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        is.close();
171bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    } catch (Exception e) {
172bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        // ignore
173bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    }
174bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                }
175bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
176bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
177bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
178bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
179bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //  We've processed through all of the files.
180bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //     sort and dump out the frequency statistics.
181bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //
182bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        Object [] encounteredChars = m.values().toArray();
183bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        Arrays.sort(encounteredChars);
184bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int cumulativeChars = 0;
185bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int cumulativePercent = 0;
186bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        if (option_v) {
187bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            System.out.println("# <char code> <occurences>  <Cumulative %>");
188bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            for (i=0; i<encounteredChars.length; i++) {
189bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                ChEl c = (ChEl)encounteredChars[i];
190bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                cumulativeChars += c.occurences;
191bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                cumulativePercent = cumulativeChars*100/totalMbcsChars;
192bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                System.out.println(i + "   " + Integer.toHexString(c.charCode) + "        "
193bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        + c.occurences + "         " + cumulativePercent);
194bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
195bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
196bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        if (option_d) {
197bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            //
198bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            //   Output the list of characters formatted for pasting into a
199bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            //     Java source code array initializer.
200bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            //     Resort into order based on the character code value, not
201bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            //      on frequency of occurence.
202bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            //
203bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            List  charList = new ArrayList();
204bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
205bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            for (i=0; i<100 && cumulativePercent<50; i++) {
206bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                ChEl c = (ChEl)encounteredChars[i];
207bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                cumulativeChars += c.occurences;
208bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                cumulativePercent = cumulativeChars*100/totalMbcsChars;
209bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                charList.add(new Integer(c.charCode));
210bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
211bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            Object [] sortedChars = charList.toArray();
212bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            Arrays.sort(sortedChars);
213bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
214bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            System.out.print("          {");
215bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            for (i=0; i<sortedChars.length; i++) {
216bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                if (i != 0) {
217bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    System.out.print(", ");
218bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    if ((i)%10 == 0) {
219bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                        System.out.print("\n           ");
220bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    }
221bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                }
222bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                int cp = ((Integer)sortedChars[i]).intValue();
223bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                System.out.print("0x" + Integer.toHexString(cp));
224bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
225bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            System.out.println("};");
226bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
227bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    }
228bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
229bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //
230bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //  This is a little class containing a
231bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //    multi-byte character value and an occurence count for that char.
232bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //  Instances of this class are kept in the collection that accumulates statistics
233bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //
234bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //  WARNING:  this class's natural ordering (from Comparable) and equals()
235bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //            are inconsistent.
236bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
237bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    static class ChEl implements Comparable {
238bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int charCode;
239bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int occurences;
240bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
241bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        ChEl(int c, int o) {
242bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            charCode = c;
243bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            occurences = o;
244bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
245bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
246bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        // Equals needs to work with a map, with the charCode as the key.
247bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //   For insertion/lookup, we care about the char code only, not the occurence count.
248bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        public boolean equals(Object other) {
249bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            ChEl o = (ChEl)other;
250bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            return o.charCode == this.charCode;
251bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
252bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
253bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        // Hashcode needs to be compatible with equals
254bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //   We're using this in a hashMap!
255bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        public int hashCode() {
256bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            return charCode;
257bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
258bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
259bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        // We want to be able to sort the results by frequency of occurence
260bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        //   Compare backwards.  We want most frequent chars first.
261bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        public int compareTo(Object other) {
262bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            ChEl o = (ChEl)other;
263bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            return (this.occurences> o.occurences? -1 :
264bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                   (this.occurences==o.occurences?  0 : 1));
265bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
266bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
267bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    }
268bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
269bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //
270bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs
271bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //              Pulls out one logical char according to the rules of EUC encoding.
272bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    //
273bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    class iteratedChar {
274bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int             charValue = 0;             // The char value is a value from the encoding.
275bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                                                   //   It's meaning is not well defined, other than
276bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                                                   //   different encodings
277bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int             index     = 0;
278bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int             nextIndex = 0;
279bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        boolean         error     = false;
280bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        boolean         done      = false;
281bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
282bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        void reset() {
283bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            charValue = 0;
284bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            index     = -1;
285bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            nextIndex = 0;
286bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            error     = false;
287bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            done      = false;
288bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
289bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
290bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int nextByte() {
291bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (nextIndex >= fileSize) {
292bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                done = true;
293bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                return -1;
294bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
295bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            int byteValue = (int)buf[nextIndex++] & 0x00ff;
296bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            return byteValue;
297bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
298bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    }
299bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
300bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
301bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    boolean nextChar(iteratedChar it) {
302bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        it.index = it.nextIndex;
303bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        it.error = false;
304bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int firstByte  = 0;
305bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int secondByte = 0;
306bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int thirdByte  = 0;
307bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        int fourthByte = 0;
308bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
309bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        buildChar: {
310bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            firstByte = it.charValue = it.nextByte();
311bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (firstByte < 0) {
312bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // Ran off the end of the input data
313bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                it.done = true;
314bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                break buildChar;
315bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
316bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (firstByte <= 0x8d) {
317bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // single byte char
318bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                break buildChar;
319bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
320bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
321bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            secondByte = it.nextByte();
322bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            it.charValue = (it.charValue << 8) | secondByte;
323bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
324bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (firstByte >= 0xA1 && firstByte <= 0xfe) {
325bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // Two byte Char
326bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                if (secondByte < 0xa1) {
327bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    it.error = true;
328bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                }
329bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                break buildChar;
330bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
331bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (firstByte == 0x8e) {
332bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // Code Set 2.
333bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
334bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
335bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // We don't know which we've got.
336bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
337bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                //   bytes will look like a well formed 2 byte char.
338bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                if (secondByte < 0xa1) {
339bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    it.error = true;
340bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                }
341bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                break buildChar;
342bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
343bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
344bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            if (firstByte == 0x8f) {
345bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // Code set 3.
346bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                // Three byte total char size, two bytes of actual char value.
347bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                thirdByte    = it.nextByte();
348bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                it.charValue = (it.charValue << 8) | thirdByte;
349bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                if (thirdByte < 0xa1) {
350bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    it.error = true;
351bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                }
352bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            }
353bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
354bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
355bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        if (it.error) {
356bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert            System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte)
357bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert                    + " " +  Integer.toHexString(thirdByte) + " " + Integer.toHexString(fourthByte));
358bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        }
359bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert        return (it.done == false);
360bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert    }
361bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert}
362bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
363bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
364bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
365bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert
366