1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  ***********************************************************************
5  * Copyright (C) 2005, International Business Machines Corporation and *
6  * others. All Rights Reserved.                                        *
7  ***********************************************************************
8  *
9  */
10 
11 package com.ibm.icu.dev.tool.charsetdet.sbcs;
12 
13 import com.ibm.icu.text.UnicodeSet;
14 
15 /**
16  * @author emader
17  *
18  * TODO To change the template for this generated type comment go to
19  * Window - Preferences - Java - Code Style - Code Templates
20  */
21 public class NGramParser
22 {
23 
24     public interface NGramParserClient
25     {
nextChar()26         char nextChar();
handleNGram(String key)27         void handleNGram(String key);
28     }
29 
30     private static final int A_NULL = 0;
31     private static final int A_ADDC = 1;
32     private static final int A_ADDS = 2;
33 
34     /*
35      * Character classes
36      */
37     public static final int C_IGNORE = 0;
38     public static final int C_LETTER = 1;
39     public static final int C_PUNCT  = 2;
40 
41     private static final int S_START  = 0;
42     private static final int S_LETTER = 1;
43     private static final int S_PUNCT  = 2;
44 
45     static final class StateEntry
46     {
47         private int newState;
48         private int action;
49 
StateEntry(int theState, int theAction)50         StateEntry(int theState, int theAction)
51         {
52             newState = theState;
53             action   = theAction;
54         }
55 
getNewState()56         public int getNewState()
57         {
58             return newState;
59         }
60 
getAction()61         public int getAction()
62         {
63             return action;
64         }
65     }
66 
67     private StateEntry[][] stateTable = {
68             {new StateEntry(S_START,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
69             {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
70             {new StateEntry(S_PUNCT,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_NULL)}
71     };
72 
73     protected final int N_GRAM_SIZE = 3;
74 
75     private char[] letters = new char[N_GRAM_SIZE];
76     private int letterCount;
77 
78     private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
79 
80     private NGramParserClient client;
81 
82     /**
83      *
84      */
NGramParser(NGramParserClient theClient)85     public NGramParser(NGramParserClient theClient)
86     {
87         client = theClient;
88         letterCount = 0;
89     }
90 
setClient(NGramParserClient theClient)91     public void setClient(NGramParserClient theClient)
92     {
93         client = theClient;
94     }
95 
96     // TODO Is this good enough, or are there other C_IGNORE characters?
97     // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
getCharClass(char ch)98     public static int getCharClass(char ch)
99     {
100         if (ch == '\'' || ch == '\uFEFF') {
101             return C_IGNORE;
102         }
103 
104         if (letterSet.contains(ch)) {
105             return C_LETTER;
106         }
107 
108         return C_PUNCT;
109     }
110 
reset()111     public void reset()
112     {
113         letterCount = 0;
114     }
115 
addLetter(char letter)116     public void addLetter(char letter)
117     {
118         // somewhat clever stuff goes here...
119         letters[letterCount++] = letter;
120 
121         if (letterCount >= N_GRAM_SIZE) {
122             String key = new String(letters);
123 
124             client.handleNGram(key);
125 
126             letterCount = N_GRAM_SIZE - 1;
127             for (int i = 0; i < letterCount; i += 1) {
128                 letters[i] = letters[i + 1];
129             }
130         }
131     }
132 
parse()133     public void parse()
134     {
135         char ch;
136         int state = 0;
137 
138         // this is where the clever stuff goes...
139         while ((ch = client.nextChar()) != 0) {
140             int charClass = getCharClass(ch);
141             StateEntry entry = stateTable[state][charClass];
142 
143             state = entry.getNewState();
144 
145             switch (entry.getAction())
146             {
147             case A_ADDC:
148                 addLetter(Character.toLowerCase(ch));
149                 break;
150 
151             case A_ADDS:
152                 addLetter(' ');
153                 break;
154 
155             case A_NULL:
156             default:
157                 break;
158             }
159         }
160 
161         addLetter(' ');
162     }
163 }
164