1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 *********************************************************************** 5 * Copyright (C) 2005-2006, International Business Machines * 6 * Corporation and others. All Rights Reserved. * 7 *********************************************************************** 8 * 9 */ 10 11 package com.ibm.icu.dev.tool.charsetdet.sbcs; 12 13 /** 14 * @author emader 15 * 16 * TODO To change the template for this generated type comment go to 17 * Window - Preferences - Java - Code Style - Code Templates 18 */ 19 public class Checker implements NGramParser.NGramParserClient 20 { 21 private NGramList ngrams; 22 private int totalNGrams; 23 private int totalHits; 24 25 private String language; 26 private String encoding; 27 28 private int[] histogram; 29 30 private static final int BUFFER_SIZE = 1024; 31 32 private char[] buffer; 33 private int bufIndex; 34 private int bufMax; 35 36 private NGramParser parser; 37 38 /** 39 * TODO This should take cumulative percent and the name... 40 */ Checker(NGramList list, InputFile dataFile)41 public Checker(NGramList list, InputFile dataFile) 42 { 43 ngrams = list; 44 ngrams.setMapper(dataFile); 45 46 language = languageName(dataFile.getFilename()); 47 encoding = dataFile.getEncoding(); 48 49 buffer = new char[BUFFER_SIZE]; 50 parser = new NGramParser(this); 51 resetCounts(); 52 53 histogram = new int[100]; 54 resetHistogram(); 55 } 56 handleNGram(String key)57 public void handleNGram(String key) 58 { 59 NGramList.NGram ngram = ngrams.get(key); 60 61 totalNGrams += 1; 62 63 if (ngram != null) { 64 totalHits += 1; 65 //ngram.incrementRefCount(); 66 } 67 } 68 resetCounts()69 private void resetCounts() 70 { 71 bufIndex = 0; 72 totalNGrams = totalHits = 0; 73 } 74 resetHistogram()75 private void resetHistogram() 76 { 77 for(int i = 0; i < 100; i += 1) { 78 histogram[i] = 0; 79 } 80 81 } 82 exceptionError(Exception e)83 private static void exceptionError(Exception e) 84 { 85 System.err.println("ioError: " + e.toString()); 86 } 87 languageName(String filename)88 private static String languageName(String filename) 89 { 90 return filename.substring(0, filename.indexOf('.')); 91 } 92 nextBuffer(InputFile inputFile)93 private boolean nextBuffer(InputFile inputFile) 94 { 95 try { 96 bufMax = inputFile.read(buffer); 97 } catch (Exception e) { 98 bufMax = -1; 99 exceptionError(e); 100 101 return false; 102 } 103 104 bufIndex = 0; 105 106 return bufMax >= 0; 107 } 108 parseBuffer()109 private void parseBuffer() 110 { 111 resetCounts(); 112 parser.reset(); 113 parser.parse(); 114 } 115 nextChar()116 public char nextChar() 117 { 118 if (bufIndex >= bufMax) { 119 return 0; 120 } 121 122 return buffer[bufIndex++]; 123 } 124 getLanguage()125 public String getLanguage() 126 { 127 return language; 128 } 129 setMapper(InputFile file)130 public void setMapper(InputFile file) 131 { 132 ngrams.setMapper(file); 133 } 134 checkBuffer(char[] theBuffer, int charCount)135 public int checkBuffer(char[] theBuffer, int charCount) 136 { 137 buffer = theBuffer; 138 bufMax = charCount; 139 140 parseBuffer(); 141 142 return totalHits; 143 } 144 check(InputFile dataFile)145 public void check(InputFile dataFile) 146 { 147 int minHist = 101, maxHist = -1; 148 149 dataFile.open(); 150 151 String dataFilename = dataFile.getFilename(); 152 String fileEncoding = dataFile.getEncoding(); 153 154 System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:"); 155 156 setMapper(dataFile); 157 resetHistogram(); 158 159 while (nextBuffer(dataFile)) { 160 parseBuffer(); 161 162 double percentHits = (double) totalHits / totalNGrams * 100.0; 163 int ph = (int) percentHits; 164 165 if (ph < minHist) { 166 minHist = ph; 167 } 168 169 if (ph > maxHist) { 170 maxHist = ph; 171 } 172 173 histogram[ph] += 1; 174 } 175 176 for(int ph = minHist; ph <= maxHist; ph += 1) { 177 System.out.println(ph + "\t" + histogram[ph]); 178 } 179 180 System.out.println(); 181 182 dataFile.close(); 183 184 return; 185 } 186 } 187