// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html#License /* *********************************************************************** * * Copyright (C) 2006-2012, International Business Machines Corporation and * others. All Rights Reserved. * *********************************************************************** * * BIG5Tool * * This tool produces the character usage frequency statistics for the Big5 * Chinese charset, for use by the ICU charset detectors. * * usage: java BIG5Tool [-d] [directory path] * * -d: Produce the data in a form to be exported to the ICU implementation * Default is to produce an informative dump. * * -sjis Do Shift_JIS. The structure of sjis is very similar to Big5. * * directory path * Source directory for the text files to be analyzed. * All files in the specified directory must be in the Big5 encoding. * */ package com.ibm.icu.dev.tool.charsetdet.mbcs; import java.io.File; import java.io.FileInputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; public class BIG5Tool { // The file buffer and file data length need to be out in class member variables // so that the code lifted from charSet detection for scanning the multi-byte chars // can see them conveniently. byte [] buf = new byte[1000000]; int fileSize; boolean option_d = false; // data option. Produce exportable data boolean option_v = true; // verbose informaional output. boolean sjis = false; // True if input text files are Shift_JIS encoded. public static void main(String[] args) { BIG5Tool This = new BIG5Tool(); This.Main(args); } void Main(String[] args) { int i; // // Command Line Option Handling // String dirName = null; for (i=0; i 255) { fileMbcsChars++; totalMbcsChars++; } if (ichar.charValue <= 255) { // Don't keep occurence statistics for the single byte range continue; } // // Frequency of occurence statistics are accumulated in a map. // ChEl keyEl = new ChEl(ichar.charValue, 0); ChEl valEl = (ChEl)m.get(keyEl); if (valEl == null) { m.put(keyEl, keyEl); valEl = keyEl; } valEl.occurences++; } if (option_v) { System.out.println(" " + fileChars + " Chars"); System.out.println(" " + fileMbcsChars + " mbcs Chars"); System.out.println(" " + errs + " errors"); System.out.println("\n"); } } } catch (Exception e) { System.err.println("Exception:" + e); } finally { if (is != null) { try { is.close(); } catch (Exception e) { // ignore } } } } // // We've processed through all of the files. // sort and dump out the frequency statistics. // Object [] encounteredChars = m.values().toArray(); Arrays.sort(encounteredChars); int cumulativeChars = 0; int cumulativePercent = 0; if (option_v) { System.out.println("# "); for (i=0; i o.occurences? -1 : (this.occurences==o.occurences? 0 : 1)); } } // // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs // Pulls out one logical char according to the rules of EUC encoding. // class iteratedChar { int charValue = 0; // The char value is a value from the encoding. // It's meaning is not well defined, other than // different encodings int index = 0; int nextIndex = 0; boolean error = false; boolean done = false; void reset() { charValue = 0; index = -1; nextIndex = 0; error = false; done = false; } int nextByte() { if (nextIndex >= fileSize) { done = true; return -1; } int byteValue = (int)buf[nextIndex++] & 0x00ff; return byteValue; } } boolean nextChar(iteratedChar it) { it.index = it.nextIndex; it.error = false; int firstByte = 0; int secondByte = 0; buildChar: { firstByte = it.charValue = it.nextByte(); if (firstByte < 0) { // Ran off the end of the input data it.done = true; break buildChar; } if (firstByte <= 0x0080 || (sjis && firstByte>=0x00a0 && firstByte< 0x00e0) || (sjis && firstByte>=0x00fd && firstByte<=0x00ff)) { // single byte char break buildChar; } secondByte = it.nextByte(); it.charValue = (it.charValue << 8) | secondByte; if (secondByte < 0x40 || secondByte == 0x007f || secondByte == 0x00ff || sjis && secondByte >= 0x00fd) { it.error = true; } if (it.error) { System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte)); } } return (it.done == false); } }