12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 3bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert/* 4bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert *********************************************************************** 5bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 6bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * Copyright (C) 2005-2012, International Business Machines Corporation and 7bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * others. All Rights Reserved. 8bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 9bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert *********************************************************************** 10bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 11bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * euc_tool 12bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 13bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * This tool produces the character usage frequency statistics for the EUC family 14bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * of charsets, for use by the ICU charset detectors. 15bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 16bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * usage: java euc_tool [-d] [directory path] 17bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 18bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * -d: Produce the data in a form to be exported to the ICU implementation 19bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * Default is to produce an informative dump. 20bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 21bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * directory path 22bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * Source directory for the files to be analyzed. 23bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * Default is the current directory. 24bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * There should be three subdirectories under the specified directory, one 25bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * each for EUC_JP, EUC_CN and EUC_KR. Within each of these subdirectories 26bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * should be text files in the specified encoding. 27bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert * 28bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert */ 29bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 30bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertpackage com.ibm.icu.dev.tool.charsetdet.mbcs; 31bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 32bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.io.File; 33bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.io.FileInputStream; 34bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.ArrayList; 35bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.Arrays; 36bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.HashMap; 37bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertimport java.util.List; 38bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 39bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubertpublic class EUCTool { 40bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 41bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // The file buffer and file data length need to be out in class member variables 42bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // so that the code lifted from charSet detection for scanning the multi-byte chars 43bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // can see them conveniently. 44bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert byte [] buf = new byte[1000000]; 45bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int fileSize; 46bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 47bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert boolean option_d = false; // data option. Produce exportable data 48bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert boolean option_v = true; // verbose informaional output. 49bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 50bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 51bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 52bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert public static void main(String[] args) { 53bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert EUCTool This = new EUCTool(); 54bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert This.Main(args); 55bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 56bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 57bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 58bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 59bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert void Main(String[] args) { 60bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int i; 61bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 62bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 63bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Command Line Option Handling 64bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 65bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert String dirName = "."; 66bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert for (i=0; i<args.length; i++) { 67bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (args[i].equals("-d")) { 68bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert option_d = true; 69bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert option_v = false; 70bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert continue; 71bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 72bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (args[i].startsWith("-")) { 73bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.err.println("Unrecongized option: " + args[i]); 74bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.exit(-1); 75bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 76bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert dirName = args[i]; 77bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 78bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 79bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 80bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Verify that the specified directory exists. 81bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 82bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert File dir = new File(dirName); 83bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (dir.isDirectory() == false) { 84bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.err.println("\"" + dirName + "\" is not a directory"); 85bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.exit(-1); 86bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 87bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 88bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 89bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Do each subdirectory of the specified directory. There should be 90bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // one per each encoding - euc-kr, euc-cn, euc-jp 91bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 92bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert File[] dirs = dir.listFiles(); 93bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert for (i=0; i<dirs.length; i++) { 94bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (dirs[i].isDirectory()) { 95bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert String nam = dirs[i].getName(); 96bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (nam.equalsIgnoreCase("CVS")) { 97bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert continue; 98bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 99bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert processDir(dirs[i]); 100bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 101bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 102bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 103bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 104bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 105bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Collect statistics from all ordinary files in a specified directory. 106bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 107bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert void processDir(File dir) { 108bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int totalMbcsChars = 0; 109bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert HashMap m = new HashMap(10000); 110bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int i; 111bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 112bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println(dir.getName()); 113bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert File[] files = dir.listFiles(); 114bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert for (i=0; i<files.length; i++) { 115bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert FileInputStream is = null; 116bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert try { 117bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (files[i].isFile()) { 118bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert is = new FileInputStream(files[i]); 119bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert fileSize = is.read(buf); 120bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (option_v) { 121bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println(files[i].getPath()); 122bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println(" " + fileSize + " bytes."); 123bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 124bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert iteratedChar ichar = new iteratedChar(); 125bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int fileChars = 0; 126bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int fileMbcsChars = 0; 127bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int errs = 0; 128bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 129bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert while (nextChar(ichar)) { 130bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (ichar.error == true) { 131bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert errs++; 132bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert continue; 133bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 134bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert fileChars++; 135bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (ichar.charValue > 255) { 136bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert fileMbcsChars++; 137bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert totalMbcsChars++; 138bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 139bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (ichar.charValue <= 255) { 140bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Don't keep occurence statistics for the single byte range 141bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert continue; 142bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 143bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 144bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 145bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Frequency of occurence statistics are accumulated in a map. 146bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 147bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert ChEl keyEl = new ChEl(ichar.charValue, 0); 148bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert ChEl valEl = (ChEl)m.get(keyEl); 149bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (valEl == null) { 150bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert m.put(keyEl, keyEl); 151bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert valEl = keyEl; 152bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 153bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert valEl.occurences++; 154bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 155bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (option_v) { 156bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println(" " + fileChars + " Chars"); 157bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println(" " + fileMbcsChars + " mbcs Chars"); 158bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println(" " + errs + " errors"); 159bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println("\n"); 160bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 161bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 162bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 163bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert catch (Exception e) { 164bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.err.println("Exception:" + e); 165bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 166bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 167bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert finally { 168bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (is != null) { 169bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert try { 170bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert is.close(); 171bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } catch (Exception e) { 172bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // ignore 173bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 174bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 175bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 176bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 177bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 178bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 179bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // We've processed through all of the files. 180bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // sort and dump out the frequency statistics. 181bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 182bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert Object [] encounteredChars = m.values().toArray(); 183bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert Arrays.sort(encounteredChars); 184bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int cumulativeChars = 0; 185bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int cumulativePercent = 0; 186bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (option_v) { 187bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println("# <char code> <occurences> <Cumulative %>"); 188bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert for (i=0; i<encounteredChars.length; i++) { 189bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert ChEl c = (ChEl)encounteredChars[i]; 190bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert cumulativeChars += c.occurences; 191bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert cumulativePercent = cumulativeChars*100/totalMbcsChars; 192bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println(i + " " + Integer.toHexString(c.charCode) + " " 193bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert + c.occurences + " " + cumulativePercent); 194bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 195bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 196bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (option_d) { 197bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 198bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Output the list of characters formatted for pasting into a 199bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Java source code array initializer. 200bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Resort into order based on the character code value, not 201bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // on frequency of occurence. 202bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 203bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert List charList = new ArrayList(); 204bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 205bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert for (i=0; i<100 && cumulativePercent<50; i++) { 206bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert ChEl c = (ChEl)encounteredChars[i]; 207bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert cumulativeChars += c.occurences; 208bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert cumulativePercent = cumulativeChars*100/totalMbcsChars; 209bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert charList.add(new Integer(c.charCode)); 210bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 211bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert Object [] sortedChars = charList.toArray(); 212bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert Arrays.sort(sortedChars); 213bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 214bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.print(" {"); 215bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert for (i=0; i<sortedChars.length; i++) { 216bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (i != 0) { 217bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.print(", "); 218bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if ((i)%10 == 0) { 219bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.print("\n "); 220bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 221bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 222bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int cp = ((Integer)sortedChars[i]).intValue(); 223bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.print("0x" + Integer.toHexString(cp)); 224bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 225bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println("};"); 226bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 227bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 228bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 229bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 230bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // This is a little class containing a 231bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // multi-byte character value and an occurence count for that char. 232bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Instances of this class are kept in the collection that accumulates statistics 233bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 234bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // WARNING: this class's natural ordering (from Comparable) and equals() 235bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // are inconsistent. 236bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 237bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert static class ChEl implements Comparable { 238bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int charCode; 239bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int occurences; 240bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 241bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert ChEl(int c, int o) { 242bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert charCode = c; 243bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert occurences = o; 244bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 245bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 246bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Equals needs to work with a map, with the charCode as the key. 247bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // For insertion/lookup, we care about the char code only, not the occurence count. 248bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert public boolean equals(Object other) { 249bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert ChEl o = (ChEl)other; 250bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return o.charCode == this.charCode; 251bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 252bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 253bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Hashcode needs to be compatible with equals 254bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // We're using this in a hashMap! 255bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert public int hashCode() { 256bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return charCode; 257bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 258bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 259bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // We want to be able to sort the results by frequency of occurence 260bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Compare backwards. We want most frequent chars first. 261bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert public int compareTo(Object other) { 262bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert ChEl o = (ChEl)other; 263bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return (this.occurences> o.occurences? -1 : 264bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert (this.occurences==o.occurences? 0 : 1)); 265bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 266bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 267bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 268bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 269bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 270bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs 271bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Pulls out one logical char according to the rules of EUC encoding. 272bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // 273bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert class iteratedChar { 274bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int charValue = 0; // The char value is a value from the encoding. 275bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // It's meaning is not well defined, other than 276bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // different encodings 277bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int index = 0; 278bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int nextIndex = 0; 279bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert boolean error = false; 280bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert boolean done = false; 281bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 282bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert void reset() { 283bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert charValue = 0; 284bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert index = -1; 285bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert nextIndex = 0; 286bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert error = false; 287bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert done = false; 288bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 289bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 290bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int nextByte() { 291bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (nextIndex >= fileSize) { 292bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert done = true; 293bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return -1; 294bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 295bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int byteValue = (int)buf[nextIndex++] & 0x00ff; 296bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return byteValue; 297bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 298bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 299bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 300bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 301bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert boolean nextChar(iteratedChar it) { 302bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.index = it.nextIndex; 303bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.error = false; 304bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int firstByte = 0; 305bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int secondByte = 0; 306bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int thirdByte = 0; 307bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert int fourthByte = 0; 308bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 309bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert buildChar: { 310bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert firstByte = it.charValue = it.nextByte(); 311bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (firstByte < 0) { 312bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Ran off the end of the input data 313bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.done = true; 314bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert break buildChar; 315bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 316bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (firstByte <= 0x8d) { 317bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // single byte char 318bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert break buildChar; 319bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 320bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 321bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert secondByte = it.nextByte(); 322bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.charValue = (it.charValue << 8) | secondByte; 323bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 324bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (firstByte >= 0xA1 && firstByte <= 0xfe) { 325bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Two byte Char 326bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (secondByte < 0xa1) { 327bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.error = true; 328bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 329bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert break buildChar; 330bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 331bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (firstByte == 0x8e) { 332bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Code Set 2. 333bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 334bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 335bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // We don't know which we've got. 336bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Treat it like EUC-JP. If the data really was EUC-TW, the following two 337bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // bytes will look like a well formed 2 byte char. 338bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (secondByte < 0xa1) { 339bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.error = true; 340bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 341bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert break buildChar; 342bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 343bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 344bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (firstByte == 0x8f) { 345bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Code set 3. 346bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert // Three byte total char size, two bytes of actual char value. 347bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert thirdByte = it.nextByte(); 348bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.charValue = (it.charValue << 8) | thirdByte; 349bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (thirdByte < 0xa1) { 350bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert it.error = true; 351bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 352bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 353bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 354bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 355bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert if (it.error) { 356bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte) 357bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert + " " + Integer.toHexString(thirdByte) + " " + Integer.toHexString(fourthByte)); 358bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 359bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert return (it.done == false); 360bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert } 361bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert} 362bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 363bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 364bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 365bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7Fredrik Roubert 366