// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ***********************************************************************
 * Copyright (C) 2005-2006, International Business Machines            *
 * Corporation and others. All Rights Reserved.                        *
 ***********************************************************************
 *
 */
10 
11 package com.ibm.icu.dev.tool.charsetdet.sbcs;
12 
13 /**
14  * @author emader
15  *
16  * TODO To change the template for this generated type comment go to
17  * Window - Preferences - Java - Code Style - Code Templates
18  */
19 public class Checker implements NGramParser.NGramParserClient
20 {
21     private NGramList ngrams;
22     private int totalNGrams;
23     private int totalHits;
24 
25     private String language;
26     private String encoding;
27 
28     private int[] histogram;
29 
30     private static final int BUFFER_SIZE = 1024;
31 
32     private char[] buffer;
33     private int bufIndex;
34     private int bufMax;
35 
36     private NGramParser parser;
37 
38     /**
39      * TODO This should take cumulative percent and the name...
40      */
Checker(NGramList list, InputFile dataFile)41     public Checker(NGramList list, InputFile dataFile)
42     {
43         ngrams = list;
44         ngrams.setMapper(dataFile);
45 
46         language = languageName(dataFile.getFilename());
47         encoding = dataFile.getEncoding();
48 
49         buffer = new char[BUFFER_SIZE];
50         parser = new NGramParser(this);
51         resetCounts();
52 
53         histogram = new int[100];
54         resetHistogram();
55    }
56 
handleNGram(String key)57     public void handleNGram(String key)
58     {
59         NGramList.NGram ngram = ngrams.get(key);
60 
61         totalNGrams += 1;
62 
63         if (ngram != null) {
64             totalHits += 1;
65             //ngram.incrementRefCount();
66         }
67     }
68 
resetCounts()69     private void resetCounts()
70     {
71         bufIndex = 0;
72         totalNGrams = totalHits = 0;
73     }
74 
resetHistogram()75     private void resetHistogram()
76     {
77         for(int i = 0; i < 100; i += 1) {
78             histogram[i] = 0;
79         }
80 
81     }
82 
exceptionError(Exception e)83     private static void exceptionError(Exception e)
84     {
85         System.err.println("ioError: " + e.toString());
86     }
87 
languageName(String filename)88     private static String languageName(String filename)
89     {
90         return filename.substring(0, filename.indexOf('.'));
91     }
92 
nextBuffer(InputFile inputFile)93     private boolean nextBuffer(InputFile inputFile)
94     {
95         try {
96             bufMax = inputFile.read(buffer);
97         } catch (Exception e) {
98             bufMax = -1;
99             exceptionError(e);
100 
101             return false;
102         }
103 
104         bufIndex = 0;
105 
106         return bufMax >= 0;
107     }
108 
parseBuffer()109     private void parseBuffer()
110     {
111         resetCounts();
112         parser.reset();
113         parser.parse();
114     }
115 
nextChar()116     public char nextChar()
117     {
118         if (bufIndex >= bufMax) {
119             return 0;
120         }
121 
122         return buffer[bufIndex++];
123     }
124 
getLanguage()125     public String getLanguage()
126     {
127         return language;
128     }
129 
setMapper(InputFile file)130     public void setMapper(InputFile file)
131     {
132         ngrams.setMapper(file);
133     }
134 
checkBuffer(char[] theBuffer, int charCount)135     public int checkBuffer(char[] theBuffer, int charCount)
136     {
137         buffer = theBuffer;
138         bufMax = charCount;
139 
140         parseBuffer();
141 
142         return totalHits;
143     }
144 
check(InputFile dataFile)145     public void check(InputFile dataFile)
146     {
147         int minHist = 101, maxHist = -1;
148 
149         dataFile.open();
150 
151         String dataFilename = dataFile.getFilename();
152         String fileEncoding = dataFile.getEncoding();
153 
154         System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");
155 
156         setMapper(dataFile);
157         resetHistogram();
158 
159         while (nextBuffer(dataFile)) {
160             parseBuffer();
161 
162             double percentHits = (double) totalHits / totalNGrams * 100.0;
163             int ph = (int) percentHits;
164 
165             if (ph < minHist) {
166                 minHist = ph;
167             }
168 
169             if (ph > maxHist) {
170                 maxHist = ph;
171             }
172 
173             histogram[ph] += 1;
174         }
175 
176         for(int ph = minHist; ph <= maxHist; ph += 1) {
177             System.out.println(ph + "\t" + histogram[ph]);
178         }
179 
180         System.out.println();
181 
182         dataFile.close();
183 
184         return;
185     }
186 }
187