1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 ***********************************************************************
5 * Copyright (C) 2005-2006, International Business Machines            *
6 * Corporation and others. All Rights Reserved.                        *
7 ***********************************************************************
8 *
9 */
10
11package com.ibm.icu.dev.tool.charsetdet.sbcs;
12
/**
 * Measures how well a list of n-grams matches the text of an input file:
 * the file is parsed one buffer at a time, and for each buffer the share of
 * parsed n-grams that are present in the list is tallied in a histogram.
 *
 * @author emader
 */
19public class Checker implements NGramParser.NGramParserClient
20{
21    private NGramList ngrams;
22    private int totalNGrams;
23    private int totalHits;
24
25    private String language;
26    private String encoding;
27
28    private int[] histogram;
29
30    private static final int BUFFER_SIZE = 1024;
31
32    private char[] buffer;
33    private int bufIndex;
34    private int bufMax;
35
36    private NGramParser parser;
37
38    /**
39     * TODO This should take cumulative percent and the name...
40     */
41    public Checker(NGramList list, InputFile dataFile)
42    {
43        ngrams = list;
44        ngrams.setMapper(dataFile);
45
46        language = languageName(dataFile.getFilename());
47        encoding = dataFile.getEncoding();
48
49        buffer = new char[BUFFER_SIZE];
50        parser = new NGramParser(this);
51        resetCounts();
52
53        histogram = new int[100];
54        resetHistogram();
55   }
56
57    public void handleNGram(String key)
58    {
59        NGramList.NGram ngram = ngrams.get(key);
60
61        totalNGrams += 1;
62
63        if (ngram != null) {
64            totalHits += 1;
65            //ngram.incrementRefCount();
66        }
67    }
68
69    private void resetCounts()
70    {
71        bufIndex = 0;
72        totalNGrams = totalHits = 0;
73    }
74
75    private void resetHistogram()
76    {
77        for(int i = 0; i < 100; i += 1) {
78            histogram[i] = 0;
79        }
80
81    }
82
83    private static void exceptionError(Exception e)
84    {
85        System.err.println("ioError: " + e.toString());
86    }
87
88    private static String languageName(String filename)
89    {
90        return filename.substring(0, filename.indexOf('.'));
91    }
92
93    private boolean nextBuffer(InputFile inputFile)
94    {
95        try {
96            bufMax = inputFile.read(buffer);
97        } catch (Exception e) {
98            bufMax = -1;
99            exceptionError(e);
100
101            return false;
102        }
103
104        bufIndex = 0;
105
106        return bufMax >= 0;
107    }
108
109    private void parseBuffer()
110    {
111        resetCounts();
112        parser.reset();
113        parser.parse();
114    }
115
116    public char nextChar()
117    {
118        if (bufIndex >= bufMax) {
119            return 0;
120        }
121
122        return buffer[bufIndex++];
123    }
124
125    public String getLanguage()
126    {
127        return language;
128    }
129
130    public void setMapper(InputFile file)
131    {
132        ngrams.setMapper(file);
133    }
134
135    public int checkBuffer(char[] theBuffer, int charCount)
136    {
137        buffer = theBuffer;
138        bufMax = charCount;
139
140        parseBuffer();
141
142        return totalHits;
143    }
144
145    public void check(InputFile dataFile)
146    {
147        int minHist = 101, maxHist = -1;
148
149        dataFile.open();
150
151        String dataFilename = dataFile.getFilename();
152        String fileEncoding = dataFile.getEncoding();
153
154        System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");
155
156        setMapper(dataFile);
157        resetHistogram();
158
159        while (nextBuffer(dataFile)) {
160            parseBuffer();
161
162            double percentHits = (double) totalHits / totalNGrams * 100.0;
163            int ph = (int) percentHits;
164
165            if (ph < minHist) {
166                minHist = ph;
167            }
168
169            if (ph > maxHist) {
170                maxHist = ph;
171            }
172
173            histogram[ph] += 1;
174        }
175
176        for(int ph = minHist; ph <= maxHist; ph += 1) {
177            System.out.println(ph + "\t" + histogram[ph]);
178        }
179
180        System.out.println();
181
182        dataFile.close();
183
184        return;
185    }
186}
187