1 /*
2  ***********************************************************************
3  * Copyright (C) 2005, International Business Machines Corporation and *
4  * others. All Rights Reserved.                                        *
5  ***********************************************************************
6  *
7  */
8 
9 package com.ibm.icu.dev.tool.charsetdet.sbcs;
10 
11 import com.ibm.icu.text.UnicodeSet;
12 
13 /**
14  * @author emader
15  *
16  * TODO To change the template for this generated type comment go to
17  * Window - Preferences - Java - Code Style - Code Templates
18  */
19 public class NGramParser
20 {
21 
22     public interface NGramParserClient
23     {
nextChar()24         char nextChar();
handleNGram(String key)25         void handleNGram(String key);
26     }
27 
28     private static final int A_NULL = 0;
29     private static final int A_ADDC = 1;
30     private static final int A_ADDS = 2;
31 
32     /*
33      * Character classes
34      */
35     public static final int C_IGNORE = 0;
36     public static final int C_LETTER = 1;
37     public static final int C_PUNCT  = 2;
38 
39     private static final int S_START  = 0;
40     private static final int S_LETTER = 1;
41     private static final int S_PUNCT  = 2;
42 
43     static final class StateEntry
44     {
45         private int newState;
46         private int action;
47 
StateEntry(int theState, int theAction)48         StateEntry(int theState, int theAction)
49         {
50             newState = theState;
51             action   = theAction;
52         }
53 
getNewState()54         public int getNewState()
55         {
56             return newState;
57         }
58 
getAction()59         public int getAction()
60         {
61             return action;
62         }
63     }
64 
65     private StateEntry[][] stateTable = {
66             {new StateEntry(S_START,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
67             {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
68             {new StateEntry(S_PUNCT,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_NULL)}
69     };
70 
71     protected final int N_GRAM_SIZE = 3;
72 
73     private char[] letters = new char[N_GRAM_SIZE];
74     private int letterCount;
75 
76     private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
77 
78     private NGramParserClient client;
79 
80     /**
81      *
82      */
NGramParser(NGramParserClient theClient)83     public NGramParser(NGramParserClient theClient)
84     {
85         client = theClient;
86         letterCount = 0;
87     }
88 
setClient(NGramParserClient theClient)89     public void setClient(NGramParserClient theClient)
90     {
91         client = theClient;
92     }
93 
94     // TODO Is this good enough, or are there other C_IGNORE characters?
95     // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
getCharClass(char ch)96     public static int getCharClass(char ch)
97     {
98         if (ch == '\'' || ch == '\uFEFF') {
99             return C_IGNORE;
100         }
101 
102         if (letterSet.contains(ch)) {
103             return C_LETTER;
104         }
105 
106         return C_PUNCT;
107     }
108 
reset()109     public void reset()
110     {
111         letterCount = 0;
112     }
113 
addLetter(char letter)114     public void addLetter(char letter)
115     {
116         // somewhat clever stuff goes here...
117         letters[letterCount++] = letter;
118 
119         if (letterCount >= N_GRAM_SIZE) {
120             String key = new String(letters);
121 
122             client.handleNGram(key);
123 
124             letterCount = N_GRAM_SIZE - 1;
125             for (int i = 0; i < letterCount; i += 1) {
126                 letters[i] = letters[i + 1];
127             }
128         }
129     }
130 
parse()131     public void parse()
132     {
133         char ch;
134         int state = 0;
135 
136         // this is where the clever stuff goes...
137         while ((ch = client.nextChar()) != 0) {
138             int charClass = getCharClass(ch);
139             StateEntry entry = stateTable[state][charClass];
140 
141             state = entry.getNewState();
142 
143             switch (entry.getAction())
144             {
145             case A_ADDC:
146                 addLetter(Character.toLowerCase(ch));
147                 break;
148 
149             case A_ADDS:
150                 addLetter(' ');
151                 break;
152 
153             case A_NULL:
154             default:
155                 break;
156             }
157         }
158 
159         addLetter(' ');
160     }
161 }
162