1 /* 2 *********************************************************************** 3 * Copyright (C) 2005, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 *********************************************************************** 6 * 7 */ 8 9 package com.ibm.icu.dev.tool.charsetdet.sbcs; 10 11 import com.ibm.icu.text.UnicodeSet; 12 13 /** 14 * @author emader 15 * 16 * TODO To change the template for this generated type comment go to 17 * Window - Preferences - Java - Code Style - Code Templates 18 */ 19 public class NGramParser 20 { 21 22 public interface NGramParserClient 23 { nextChar()24 char nextChar(); handleNGram(String key)25 void handleNGram(String key); 26 } 27 28 private static final int A_NULL = 0; 29 private static final int A_ADDC = 1; 30 private static final int A_ADDS = 2; 31 32 /* 33 * Character classes 34 */ 35 public static final int C_IGNORE = 0; 36 public static final int C_LETTER = 1; 37 public static final int C_PUNCT = 2; 38 39 private static final int S_START = 0; 40 private static final int S_LETTER = 1; 41 private static final int S_PUNCT = 2; 42 43 static final class StateEntry 44 { 45 private int newState; 46 private int action; 47 StateEntry(int theState, int theAction)48 StateEntry(int theState, int theAction) 49 { 50 newState = theState; 51 action = theAction; 52 } 53 getNewState()54 public int getNewState() 55 { 56 return newState; 57 } 58 getAction()59 public int getAction() 60 { 61 return action; 62 } 63 } 64 65 private StateEntry[][] stateTable = { 66 {new StateEntry(S_START, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)}, 67 {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)}, 68 {new StateEntry(S_PUNCT, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_NULL)} 69 }; 70 71 protected final int N_GRAM_SIZE = 3; 72 73 private char[] letters = new char[N_GRAM_SIZE]; 74 private int letterCount; 75 76 private static UnicodeSet letterSet = new UnicodeSet("[:letter:]"); 77 78 private NGramParserClient client; 79 80 /** 81 * 82 */ NGramParser(NGramParserClient theClient)83 public NGramParser(NGramParserClient theClient) 84 { 85 client = theClient; 86 letterCount = 0; 87 } 88 setClient(NGramParserClient theClient)89 public void setClient(NGramParserClient theClient) 90 { 91 client = theClient; 92 } 93 94 // TODO Is this good enough, or are there other C_IGNORE characters? 95 // TODO Could this make Latin letters C_PUNCT for non-Latin scripts? getCharClass(char ch)96 public static int getCharClass(char ch) 97 { 98 if (ch == '\'' || ch == '\uFEFF') { 99 return C_IGNORE; 100 } 101 102 if (letterSet.contains(ch)) { 103 return C_LETTER; 104 } 105 106 return C_PUNCT; 107 } 108 reset()109 public void reset() 110 { 111 letterCount = 0; 112 } 113 addLetter(char letter)114 public void addLetter(char letter) 115 { 116 // somewhat clever stuff goes here... 117 letters[letterCount++] = letter; 118 119 if (letterCount >= N_GRAM_SIZE) { 120 String key = new String(letters); 121 122 client.handleNGram(key); 123 124 letterCount = N_GRAM_SIZE - 1; 125 for (int i = 0; i < letterCount; i += 1) { 126 letters[i] = letters[i + 1]; 127 } 128 } 129 } 130 parse()131 public void parse() 132 { 133 char ch; 134 int state = 0; 135 136 // this is where the clever stuff goes... 137 while ((ch = client.nextChar()) != 0) { 138 int charClass = getCharClass(ch); 139 StateEntry entry = stateTable[state][charClass]; 140 141 state = entry.getNewState(); 142 143 switch (entry.getAction()) 144 { 145 case A_ADDC: 146 addLetter(Character.toLowerCase(ch)); 147 break; 148 149 case A_ADDS: 150 addLetter(' '); 151 break; 152 153 case A_NULL: 154 default: 155 break; 156 } 157 } 158 159 addLetter(' '); 160 } 161 } 162