1 /*
2  * Copyright (C) 2008-2012  OMRON SOFTWARE Co., Ltd.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package jp.co.omronsoft.openwnn.JAJP;
18 
19 import jp.co.omronsoft.openwnn.*;
20 import java.util.*;
21 
22 /**
23  * The penWnn Clause Converter class for Japanese IME.
24  *
25  * @author Copyright (C) 2009 OMRON SOFTWARE CO., LTD.  All Rights Reserved.
26  */
27 public class OpenWnnClauseConverterJAJP {
28     /** Score(frequency value) of word in the learning dictionary */
29     private static final int FREQ_LEARN = 600;
30     /** Score(frequency value) of word in the user dictionary */
31     private static final int FREQ_USER  = 500;
32 
33     /** Maximum limit length of input */
34     public static final int MAX_INPUT_LENGTH = 50;
35 
36     /** search cache for unique independent words (jiritsugo) */
37     private HashMap<String, ArrayList<WnnWord>> mIndepWordBag;
38     /** search cache for all independent words (jiritsugo) */
39     private HashMap<String, ArrayList<WnnWord>> mAllIndepWordBag;
40     /** search cache for ancillary words (fuzokugo) */
41     private HashMap<String, ArrayList<WnnWord>> mFzkPatterns;
42 
43     /** connect matrix for generating a clause */
44     private byte[][] mConnectMatrix;
45 
46     /** dictionaries */
47     private WnnDictionary mDictionary;
48 
49     /** candidates of conversion */
50     private LinkedList mConvertResult;
51 
52     /** work area for consecutive clause conversion */
53     private WnnSentence[] mSentenceBuffer;
54 
55     /** part of speech (default) */
56     private WnnPOS mPosDefault;
57     /** part of speech (end of clause/not end of sentence) */
58     private WnnPOS mPosEndOfClause1;
59     /** part of speech (end of clause/any place) */
60     private WnnPOS mPosEndOfClause2;
61     /** part of speech (end of sentence) */
62     private WnnPOS mPosEndOfClause3;
63 
64     /** cost value of a clause */
65     private static final int CLAUSE_COST = -1000;
66 
67     /** The candidate filter */
68     private CandidateFilter mFilter = null;
69 
70     /**
71      * Constructor
72      */
OpenWnnClauseConverterJAJP()73     public OpenWnnClauseConverterJAJP() {
74         mIndepWordBag  = new HashMap<String, ArrayList<WnnWord>>();
75         mAllIndepWordBag  = new HashMap<String, ArrayList<WnnWord>>();
76         mFzkPatterns   = new HashMap();
77         mConvertResult = new LinkedList();
78 
79         mSentenceBuffer = new WnnSentence[MAX_INPUT_LENGTH];
80     }
81 
82     /**
83      * Set the dictionary
84      *
85      * @param dict  The dictionary for phrase conversion
86      */
setDictionary(WnnDictionary dict)87     public void setDictionary(WnnDictionary dict) {
88         /* get connect matrix */
89         mConnectMatrix = dict.getConnectMatrix();
90 
91         /* clear dictionary settings */
92         mDictionary = dict;
93         dict.clearDictionary();
94         dict.clearApproxPattern();
95 
96         /* clear work areas */
97         mIndepWordBag.clear();
98         mAllIndepWordBag.clear();
99         mFzkPatterns.clear();
100 
101         /* get part of speech tags */
102         mPosDefault      = dict.getPOS(WnnDictionary.POS_TYPE_MEISI);
103         mPosEndOfClause1 = dict.getPOS(WnnDictionary.POS_TYPE_V1);
104         mPosEndOfClause2 = dict.getPOS(WnnDictionary.POS_TYPE_V2);
105         mPosEndOfClause3 = dict.getPOS(WnnDictionary.POS_TYPE_V3);
106     }
107 
108     /**
109      * Set the candidate filter
110      *
111      * @param filter	The candidate filter
112      */
setFilter(CandidateFilter filter)113     public void setFilter(CandidateFilter filter) {
114     	mFilter = filter;
115     }
116 
117     /**
118      * Kana-to-Kanji conversion (single clause).
119      * <br>
120      * This method execute single clause conversion.
121       *
122      * @param input		The input string
123      * @return			The candidates of conversion; {@code null} if an error occurs.
124      */
convert(String input)125      public Iterator convert(String input) {
126         /* do nothing if no dictionary is specified */
127         if (mConnectMatrix == null || mDictionary == null) {
128             return null;
129         }
130         /* do nothing if the length of input exceeds the limit */
131         if (input.length() > MAX_INPUT_LENGTH) {
132             return null;
133         }
134 
135         /* clear the candidates list */
136         mConvertResult.clear();
137 
138         /* try single clause conversion */
139         if (!singleClauseConvert(mConvertResult, input, mPosEndOfClause2, true)) {
140             return null;
141         }
142         return mConvertResult.iterator();
143     }
144 
145     /**
146      * Consecutive clause conversion.
147      *
148      * @param input		The input string
149      * @return			The result of consecutive clause conversion; {@code null} if fail.
150      */
consecutiveClauseConvert(String input)151     public WnnSentence consecutiveClauseConvert(String input) {
152         LinkedList clauses = new LinkedList();
153 
154         /* clear the cache which is not matched */
155         for (int i = 0; i < input.length(); i++) {
156             mSentenceBuffer[i] = null;
157         }
158         WnnSentence[] sentence = mSentenceBuffer;
159 
160         /* consecutive clause conversion */
161         for (int start = 0; start < input.length(); start++) {
162             if (start != 0 && sentence[start-1] == null) {
163                 continue;
164             }
165 
166             /* limit the length of a clause */
167             int end = input.length();
168             if (end > start + 20) {
169                 end = start + 20;
170             }
171             /* make clauses */
172             for ( ; end > start; end--) {
173                 int idx = end - 1;
174 
175                 /* cutting a branch */
176                 if (sentence[idx] != null) {
177                     if (start != 0) {
178                         if (sentence[idx].frequency > sentence[start-1].frequency + CLAUSE_COST + FREQ_LEARN) {
179                             /* there may be no way to be the best sequence from the 'start' */
180                             break;
181                         }
182                     } else {
183                         if (sentence[idx].frequency > CLAUSE_COST + FREQ_LEARN) {
184                             /* there may be no way to be the best sequence from the 'start' */
185                             break;
186                         }
187                     }
188                 }
189 
190                 String key = input.substring(start, end);
191                 clauses.clear();
192                 WnnClause bestClause = null;
193                 if (end == input.length()) {
194                     /* get the clause which can be the end of the sentence */
195                     singleClauseConvert(clauses, key, mPosEndOfClause1, false);
196                 } else {
197                     /* get the clause which is not the end of the sentence */
198                     singleClauseConvert(clauses, key, mPosEndOfClause3, false);
199                 }
200                 if (clauses.isEmpty()) {
201                     bestClause = defaultClause(key);
202                 } else {
203                     bestClause = (WnnClause)clauses.get(0);
204                 }
205 
206                 /* make a sub-sentence */
207                 WnnSentence ws;
208                 if (start == 0) {
209                     ws = new WnnSentence(key, bestClause);
210                 } else {
211                     ws = new WnnSentence(sentence[start-1], bestClause);
212                 }
213                 ws.frequency += CLAUSE_COST;
214 
215                 /* update the best sub-sentence on the cache buffer */
216                 if (sentence[idx] == null || (sentence[idx].frequency < ws.frequency)) {
217                     sentence[idx] = ws;
218                 }
219             }
220         }
221 
222         /* return the result of the consecutive clause conversion */
223         if (sentence[input.length() - 1] != null) {
224             return sentence[input.length() - 1];
225         }
226         return null;
227     }
228 
229     /**
230      * Consecutive clause conversion.
231      *
232      * @param resultList	Where to store the result
233      * @param input			Input string
234      * @return				{@code true} if success; {@code false} if fail.
235      */
consecutiveClauseConvert(LinkedList resultList, String input)236     private boolean consecutiveClauseConvert(LinkedList resultList, String input) {
237         WnnSentence sentence = consecutiveClauseConvert(input);
238 
239         /* set the result of the consecutive clause conversion on the top of the list */
240         if (sentence != null) {
241             resultList.add(0, sentence);
242             return true;
243         }
244         return false;
245     }
246 
247     /**
248      * Single clause conversion.
249      *
250      * @param clauseList	Where to store the results
251      * @param input			Input string
252      * @param terminal		Part of speech tag at the terminal
253      * @param all			Get all candidates or not
254      * @return				{@code true} if success; {@code false} if fail.
255      */
singleClauseConvert(LinkedList clauseList, String input, WnnPOS terminal, boolean all)256     private boolean singleClauseConvert(LinkedList clauseList, String input, WnnPOS terminal, boolean all) {
257         boolean ret = false;
258 
259         /* get clauses without ancillary word */
260         ArrayList<WnnWord> stems = getIndependentWords(input, all);
261         if (stems != null && (!stems.isEmpty())) {
262             Iterator<WnnWord> stemsi = stems.iterator();
263             while (stemsi.hasNext()) {
264                 WnnWord stem = stemsi.next();
265                 if (addClause(clauseList, input, stem, null, terminal, all)) {
266                     ret = true;
267                 }
268             }
269         }
270 
271         /* get clauses with ancillary word */
272         int max = CLAUSE_COST * 2;
273         for (int split = 1; split < input.length(); split++) {
274             /* get ancillary patterns */
275             String str = input.substring(split);
276             ArrayList<WnnWord> fzks = getAncillaryPattern(str);
277             if (fzks == null || fzks.isEmpty()) {
278                 continue;
279             }
280 
281             /* get candidates of stem in a clause */
282             str = input.substring(0, split);
283             stems = getIndependentWords(str, all);
284             if (stems == null || stems.isEmpty()) {
285                 if (mDictionary.searchWord(WnnDictionary.SEARCH_PREFIX, WnnDictionary.ORDER_BY_FREQUENCY, str) <= 0) {
286                     break;
287                 } else {
288                     continue;
289                 }
290             }
291             /* make clauses */
292             Iterator<WnnWord> stemsi = stems.iterator();
293             while (stemsi.hasNext()) {
294                 WnnWord stem = stemsi.next();
295                 if (all || stem.frequency > max) {
296                     Iterator<WnnWord> fzksi  = fzks.iterator();
297                     while (fzksi.hasNext()) {
298                         WnnWord fzk = fzksi.next();
299                         if (addClause(clauseList, input, stem, fzk, terminal, all)) {
300                             ret = true;
301                             max = stem.frequency;
302                         }
303                     }
304                 }
305             }
306         }
307         return ret;
308     }
309 
310     /**
311      * Add valid clause to the candidates list.
312      *
313      * @param clauseList	Where to store the results
314      * @param input			Input string
315      * @param stem			Stem of the clause (a independent word)
316      * @param fzk			Ancillary pattern
317      * @param terminal		Part of speech tag at the terminal
318      * @param all			Get all candidates or not
319      * @return				{@code true} if add the clause to the list; {@code false} if not.
320      */
addClause(LinkedList<WnnClause> clauseList, String input, WnnWord stem, WnnWord fzk, WnnPOS terminal, boolean all)321     private boolean addClause(LinkedList<WnnClause> clauseList, String input, WnnWord stem, WnnWord fzk,
322                               WnnPOS terminal, boolean all) {
323         WnnClause clause = null;
324         /* check if the part of speech is valid */
325         if (fzk == null) {
326             if (connectible(stem.partOfSpeech.right, terminal.left)) {
327                 clause = new WnnClause(input, stem);
328             }
329         } else {
330             if (connectible(stem.partOfSpeech.right, fzk.partOfSpeech.left)
331                 && connectible(fzk.partOfSpeech.right, terminal.left)) {
332                 clause = new WnnClause(input, stem, fzk);
333             }
334         }
335         if (clause == null) {
336             return false;
337         }
338         if (mFilter != null && !mFilter.isAllowed(clause)) {
339         	return false;
340         }
341 
342         /* store to the list */
343         if (clauseList.isEmpty()) {
344             /* add if the list is empty */
345             clauseList.add(0, clause);
346             return true;
347         } else {
348             if (!all) {
349                 /* reserve only the best clause */
350                 WnnClause best = (WnnClause)clauseList.get(0);
351                 if (best.frequency < clause.frequency) {
352                     clauseList.set(0, clause);
353                     return true;
354                 }
355             } else {
356                 /* reserve all clauses */
357                 Iterator clauseListi = clauseList.iterator();
358                 int index = 0;
359                 while (clauseListi.hasNext()) {
360                     WnnClause clausei = (WnnClause)clauseListi.next();
361                     if (clausei.frequency < clause.frequency) {
362                         break;
363                     }
364                     index++;
365                 }
366                 clauseList.add(index, clause);
367                 return true;
368             }
369         }
370 
371         return false;
372     }
373 
374     /**
375      * Check the part-of-speeches are connectable.
376      *
377      * @param right		Right attribute of the preceding word/clause
378      * @param left		Left attribute of the following word/clause
379      * @return			{@code true} if there are connectable; {@code false} if otherwise
380      */
connectible(int right, int left)381     private boolean connectible(int right, int left) {
382         try {
383             if (mConnectMatrix[left][right] != 0) {
384                 return true;
385             }
386         } catch (Exception ex) {
387         }
388         return false;
389     }
390 
391     /**
392      * Get all exact matched ancillary words(Fuzokugo) list.
393      *
394      * @param input		Search key
395      * @return			List of ancillary words
396      */
getAncillaryPattern(String input)397     private ArrayList<WnnWord> getAncillaryPattern(String input) {
398         if (input.length() == 0) {
399             return null;
400         }
401 
402         HashMap<String,ArrayList<WnnWord>> fzkPat = mFzkPatterns;
403         ArrayList<WnnWord> fzks = fzkPat.get(input);
404         if (fzks != null) {
405             return fzks;
406         }
407 
408         /* set dictionaries */
409         WnnDictionary dict = mDictionary;
410         dict.clearDictionary();
411         dict.clearApproxPattern();
412         dict.setDictionary(6, 400, 500);
413 
414         for (int start = input.length() - 1; start >= 0; start--) {
415             String key = input.substring(start);
416 
417             fzks = fzkPat.get(key);
418             if (fzks != null) {
419                 continue;
420             }
421 
422             fzks = new ArrayList<WnnWord>();
423             mFzkPatterns.put(key, fzks);
424 
425             /* search ancillary words */
426             dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, key);
427             WnnWord word;
428             while ((word = dict.getNextWord()) != null) {
429                 fzks.add(word);
430             }
431 
432             /* concatenate sequence of ancillary words */
433             for (int end = input.length() - 1; end > start; end--) {
434                 ArrayList<WnnWord> followFzks = fzkPat.get(input.substring(end));
435                 if (followFzks == null ||  followFzks.isEmpty()) {
436                     continue;
437                 }
438                 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input.substring(start, end));
439                 while ((word = dict.getNextWord()) != null) {
440                     Iterator<WnnWord> followFzksi = followFzks.iterator();
441                     while (followFzksi.hasNext()) {
442                         WnnWord follow = followFzksi.next();
443                         if (connectible(word.partOfSpeech.right, follow.partOfSpeech.left)) {
444                             fzks.add(new WnnWord(key, key, new WnnPOS(word.partOfSpeech.left, follow.partOfSpeech.right)));
445                         }
446                     }
447                 }
448             }
449         }
450         return fzks;
451     }
452 
453     /**
454      * Get all exact matched independent words(Jiritsugo) list.
455      *
456      * @param input    Search key
457      * @param all      {@code true} if list all words; {@code false} if list words which has an unique part of speech tag.
458      * @return			List of words; {@code null} if {@code input.length() == 0}.
459      */
getIndependentWords(String input, boolean all)460     private ArrayList<WnnWord> getIndependentWords(String input, boolean all) {
461         if (input.length() == 0) {
462             return null;
463         }
464 
465         ArrayList<WnnWord> words = (all)? mAllIndepWordBag.get(input) : mIndepWordBag.get(input);
466 
467         if (words == null) {
468             /* set dictionaries */
469             WnnDictionary dict = mDictionary;
470             dict.clearDictionary();
471             dict.clearApproxPattern();
472             dict.setDictionary(4, 0, 10);
473             dict.setDictionary(5, 400, 500);
474             dict.setDictionary(WnnDictionary.INDEX_USER_DICTIONARY, FREQ_USER, FREQ_USER);
475             dict.setDictionary(WnnDictionary.INDEX_LEARN_DICTIONARY, FREQ_LEARN, FREQ_LEARN);
476 
477             words = new ArrayList<WnnWord>();
478             WnnWord word;
479             if (all) {
480             	mAllIndepWordBag.put(input, words);
481                 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
482                 /* store all words */
483                 while ((word = dict.getNextWord()) != null) {
484                     if (input.equals(word.stroke)) {
485                         words.add(word);
486                     }
487                 }
488             } else {
489             	mIndepWordBag.put(input, words);
490                 dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
491                 /* store a word which has an unique part of speech tag */
492                 while ((word = dict.getNextWord()) != null) {
493                     if (input.equals(word.stroke)) {
494                         Iterator<WnnWord> list = words.iterator();
495                         boolean found = false;
496                         while (list.hasNext()) {
497                             WnnWord w = (WnnWord)list.next();
498                                 if (w.partOfSpeech.right == word.partOfSpeech.right) {
499                                     found = true;
500                                     break;
501                                 }
502                         }
503                         if (!found) {
504                             words.add(word);
505                         }
506                         if (word.frequency < 400) {
507                             break;
508                         }
509                     }
510                 }
511             }
512             addAutoGeneratedCandidates(input, words, all);
513         }
514         return words;
515     }
516 
517     /**
518      * Add some words not including in the dictionary.
519      * <br>
520      * This method adds some words which are not in the dictionary.
521      *
522      * @param input     Input string
523      * @param wordList  List to store words
524      * @param all       Get all candidates or not
525      */
addAutoGeneratedCandidates(String input, ArrayList wordList, boolean all)526     private void addAutoGeneratedCandidates(String input, ArrayList wordList, boolean all) {
527         wordList.add(new WnnWord(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
528     }
529 
530     /**
531      * Get a default clause.
532      * <br>
533      * This method generates a clause which has a string same as input
534      * and the default part-of-speech tag.
535      *
536      * @param input    Input string
537      * @return			Default clause
538      */
defaultClause(String input)539     private WnnClause defaultClause(String input) {
540         return (new WnnClause(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
541     }
542 }