1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.Constants;
21 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
22 
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.Collections;
26 import java.util.Iterator;
27 import java.util.LinkedList;
28 
29 /**
30  * A dictionary that can fusion heads and tails of words for more compression.
31  */
32 @UsedForTesting
33 public final class FusionDictionary implements Iterable<WordProperty> {
34     private static final boolean DBG = MakedictLog.DBG;
35 
36     private static int CHARACTER_NOT_FOUND_INDEX = -1;
37 
38     /**
39      * A node array of the dictionary, containing several PtNodes.
40      *
41      * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the
42      * real information.
43      * This class also contains fields to cache size and address, to help with binary
44      * generation.
45      */
46     public static final class PtNodeArray {
47         ArrayList<PtNode> mData;
48         // To help with binary generation
49         int mCachedSize = Integer.MIN_VALUE;
50         // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
51         // always hold the same value except between dictionary address compression, during which
52         // the update process needs to know about both values at the same time. Updating will
53         // update the AfterUpdate value, and the code will move them to BeforeUpdate before
54         // the next update pass.
55         int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
56         int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
57         int mCachedParentAddress = 0;
58 
PtNodeArray()59         public PtNodeArray() {
60             mData = new ArrayList<>();
61         }
PtNodeArray(ArrayList<PtNode> data)62         public PtNodeArray(ArrayList<PtNode> data) {
63             Collections.sort(data, PTNODE_COMPARATOR);
64             mData = data;
65         }
66     }
67 
68     /**
69      * PtNode is a group of characters, with probability information, shortcut targets, bigrams,
70      * and children (Pt means Patricia Trie).
71      *
72      * This is the central class of the in-memory representation. A PtNode is what can
73      * be seen as a traditional "trie node", except it can hold several characters at the
74      * same time. A PtNode essentially represents one or several characters in the middle
75      * of the trie tree; as such, it can be a terminal, and it can have children.
76      * In this in-memory representation, whether the PtNode is a terminal or not is represented
77      * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the
78      * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null
79      * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present,
80      * are non-null.
81      */
82     public static final class PtNode {
83         private static final int NOT_A_TERMINAL = -1;
84         final int mChars[];
85         ArrayList<WeightedString> mShortcutTargets;
86         ArrayList<WeightedString> mBigrams;
87         // null == mProbabilityInfo indicates this is not a terminal.
88         ProbabilityInfo mProbabilityInfo;
89         int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
90         PtNodeArray mChildren;
91         boolean mIsNotAWord; // Only a shortcut
92         boolean mIsBlacklistEntry;
93         // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
94         // generation. Before and After always hold the same value except during dictionary
95         // address compression, where the update process needs to know about both values at the
96         // same time. Updating will update the AfterUpdate value, and the code will move them
97         // to BeforeUpdate before the next update pass.
98         // The update process does not need two versions of mCachedSize.
99         int mCachedSize; // The size, in bytes, of this PtNode.
100         int mCachedAddressBeforeUpdate; // The address of this PtNode (before update)
101         int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
102 
PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isBlacklistEntry)103         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
104                 final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
105                 final boolean isNotAWord, final boolean isBlacklistEntry) {
106             mChars = chars;
107             mProbabilityInfo = probabilityInfo;
108             mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
109             mShortcutTargets = shortcutTargets;
110             mBigrams = bigrams;
111             mChildren = null;
112             mIsNotAWord = isNotAWord;
113             mIsBlacklistEntry = isBlacklistEntry;
114         }
115 
PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isBlacklistEntry, final PtNodeArray children)116         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
117                 final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
118                 final boolean isNotAWord, final boolean isBlacklistEntry,
119                 final PtNodeArray children) {
120             mChars = chars;
121             mProbabilityInfo = probabilityInfo;
122             mShortcutTargets = shortcutTargets;
123             mBigrams = bigrams;
124             mChildren = children;
125             mIsNotAWord = isNotAWord;
126             mIsBlacklistEntry = isBlacklistEntry;
127         }
128 
addChild(PtNode n)129         public void addChild(PtNode n) {
130             if (null == mChildren) {
131                 mChildren = new PtNodeArray();
132             }
133             mChildren.mData.add(n);
134         }
135 
getTerminalId()136         public int getTerminalId() {
137             return mTerminalId;
138         }
139 
isTerminal()140         public boolean isTerminal() {
141             return mProbabilityInfo != null;
142         }
143 
getProbability()144         public int getProbability() {
145             if (isTerminal()) {
146                 return mProbabilityInfo.mProbability;
147             } else {
148                 return NOT_A_TERMINAL;
149             }
150         }
151 
getIsNotAWord()152         public boolean getIsNotAWord() {
153             return mIsNotAWord;
154         }
155 
getIsBlacklistEntry()156         public boolean getIsBlacklistEntry() {
157             return mIsBlacklistEntry;
158         }
159 
getShortcutTargets()160         public ArrayList<WeightedString> getShortcutTargets() {
161             // We don't want write permission to escape outside the package, so we return a copy
162             if (null == mShortcutTargets) return null;
163             final ArrayList<WeightedString> copyOfShortcutTargets =
164                     new ArrayList<>(mShortcutTargets);
165             return copyOfShortcutTargets;
166         }
167 
getBigrams()168         public ArrayList<WeightedString> getBigrams() {
169             // We don't want write permission to escape outside the package, so we return a copy
170             if (null == mBigrams) return null;
171             final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams);
172             return copyOfBigrams;
173         }
174 
hasSeveralChars()175         public boolean hasSeveralChars() {
176             assert(mChars.length > 0);
177             return 1 < mChars.length;
178         }
179 
180         /**
181          * Adds a word to the bigram list. Updates the probability information if the word already
182          * exists.
183          */
addBigram(final String word, final ProbabilityInfo probabilityInfo)184         public void addBigram(final String word, final ProbabilityInfo probabilityInfo) {
185             if (mBigrams == null) {
186                 mBigrams = new ArrayList<>();
187             }
188             WeightedString bigram = getBigram(word);
189             if (bigram != null) {
190                 bigram.mProbabilityInfo = probabilityInfo;
191             } else {
192                 bigram = new WeightedString(word, probabilityInfo);
193                 mBigrams.add(bigram);
194             }
195         }
196 
197         /**
198          * Gets the shortcut target for the given word. Returns null if the word is not in the
199          * shortcut list.
200          */
getShortcut(final String word)201         public WeightedString getShortcut(final String word) {
202             // TODO: Don't do a linear search
203             if (mShortcutTargets != null) {
204                 final int size = mShortcutTargets.size();
205                 for (int i = 0; i < size; ++i) {
206                     WeightedString shortcut = mShortcutTargets.get(i);
207                     if (shortcut.mWord.equals(word)) {
208                         return shortcut;
209                     }
210                 }
211             }
212             return null;
213         }
214 
215         /**
216          * Gets the bigram for the given word.
217          * Returns null if the word is not in the bigrams list.
218          */
getBigram(final String word)219         public WeightedString getBigram(final String word) {
220             // TODO: Don't do a linear search
221             if (mBigrams != null) {
222                 final int size = mBigrams.size();
223                 for (int i = 0; i < size; ++i) {
224                     WeightedString bigram = mBigrams.get(i);
225                     if (bigram.mWord.equals(word)) {
226                         return bigram;
227                     }
228                 }
229             }
230             return null;
231         }
232 
233         /**
234          * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to
235          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
236          * updated if they are higher than the existing ones.
237          */
update(final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final boolean isNotAWord, final boolean isBlacklistEntry)238         private void update(final ProbabilityInfo probabilityInfo,
239                 final ArrayList<WeightedString> shortcutTargets,
240                 final ArrayList<WeightedString> bigrams,
241                 final boolean isNotAWord, final boolean isBlacklistEntry) {
242             mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
243             if (shortcutTargets != null) {
244                 if (mShortcutTargets == null) {
245                     mShortcutTargets = shortcutTargets;
246                 } else {
247                     final int size = shortcutTargets.size();
248                     for (int i = 0; i < size; ++i) {
249                         final WeightedString shortcut = shortcutTargets.get(i);
250                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
251                         if (existingShortcut == null) {
252                             mShortcutTargets.add(shortcut);
253                         } else {
254                             existingShortcut.mProbabilityInfo = ProbabilityInfo.max(
255                                     existingShortcut.mProbabilityInfo, shortcut.mProbabilityInfo);
256                         }
257                     }
258                 }
259             }
260             if (bigrams != null) {
261                 if (mBigrams == null) {
262                     mBigrams = bigrams;
263                 } else {
264                     final int size = bigrams.size();
265                     for (int i = 0; i < size; ++i) {
266                         final WeightedString bigram = bigrams.get(i);
267                         final WeightedString existingBigram = getBigram(bigram.mWord);
268                         if (existingBigram == null) {
269                             mBigrams.add(bigram);
270                         } else {
271                             existingBigram.mProbabilityInfo = ProbabilityInfo.max(
272                                     existingBigram.mProbabilityInfo, bigram.mProbabilityInfo);
273                         }
274                     }
275                 }
276             }
277             mIsNotAWord = isNotAWord;
278             mIsBlacklistEntry = isBlacklistEntry;
279         }
280     }
281 
282     public final DictionaryOptions mOptions;
283     public final PtNodeArray mRootNodeArray;
284 
FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options)285     public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) {
286         mRootNodeArray = rootNodeArray;
287         mOptions = options;
288     }
289 
addOptionAttribute(final String key, final String value)290     public void addOptionAttribute(final String key, final String value) {
291         mOptions.mAttributes.put(key, value);
292     }
293 
294     /**
295      * Helper method to convert a String to an int array.
296      */
getCodePoints(final String word)297     static int[] getCodePoints(final String word) {
298         // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray,
299         // which is not visible from the makedict package. Factor this code.
300         final int length = word.length();
301         if (length <= 0) return new int[] {};
302         final char[] characters = word.toCharArray();
303         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
304         int codePoint = Character.codePointAt(characters, 0);
305         int dsti = 0;
306         for (int srci = Character.charCount(codePoint);
307                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
308             codePoints[dsti] = codePoint;
309             codePoint = Character.codePointAt(characters, srci);
310         }
311         codePoints[dsti] = codePoint;
312         return codePoints;
313     }
314 
315     /**
316      * Helper method to add a word as a string.
317      *
318      * This method adds a word to the dictionary with the given frequency. Optional
319      * lists of bigrams and shortcuts can be passed here. For each word inside,
320      * they will be added to the dictionary as necessary.
321      *
322      * @param word the word to add.
323      * @param probabilityInfo probability information of the word.
324      * @param shortcutTargets a list of shortcut targets for this word, or null.
325      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
326      */
add(final String word, final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)327     public void add(final String word, final ProbabilityInfo probabilityInfo,
328             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
329         add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord,
330                 false /* isBlacklistEntry */);
331     }
332 
333     /**
334      * Helper method to add a blacklist entry as a string.
335      *
336      * @param word the word to add as a blacklist entry.
337      * @param shortcutTargets a list of shortcut targets for this word, or null.
338      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
339      */
addBlacklistEntry(final String word, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)340     public void addBlacklistEntry(final String word,
341             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
342         add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord,
343                 true /* isBlacklistEntry */);
344     }
345 
346     /**
347      * Sanity check for a PtNode array.
348      *
349      * This method checks that all PtNodes in a node array are ordered as expected.
350      * If they are, nothing happens. If they aren't, an exception is thrown.
351      */
checkStack(PtNodeArray ptNodeArray)352     private void checkStack(PtNodeArray ptNodeArray) {
353         ArrayList<PtNode> stack = ptNodeArray.mData;
354         int lastValue = -1;
355         for (int i = 0; i < stack.size(); ++i) {
356             int currentValue = stack.get(i).mChars[0];
357             if (currentValue <= lastValue)
358                 throw new RuntimeException("Invalid stack");
359             else
360                 lastValue = currentValue;
361         }
362     }
363 
364     /**
365      * Helper method to add a new bigram to the dictionary.
366      *
367      * @param word0 the previous word of the context
368      * @param word1 the next word of the context
369      * @param probabilityInfo the bigram probability info
370      */
setBigram(final String word0, final String word1, final ProbabilityInfo probabilityInfo)371     public void setBigram(final String word0, final String word1,
372             final ProbabilityInfo probabilityInfo) {
373         PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
374         if (ptNode0 != null) {
375             final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
376             if (ptNode1 == null) {
377                 add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */,
378                         false /* isBlacklistEntry */);
379                 // The PtNode for the first word may have moved by the above insertion,
380                 // if word1 and word2 share a common stem that happens not to have been
381                 // a cutting point until now. In this case, we need to refresh ptNode.
382                 ptNode0 = findWordInTree(mRootNodeArray, word0);
383             }
384             ptNode0.addBigram(word1, probabilityInfo);
385         } else {
386             throw new RuntimeException("First word of bigram not found " + word0);
387         }
388     }
389 
390     /**
391      * Add a word to this dictionary.
392      *
393      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
394      * an exception is thrown.
395      *
396      * @param word the word, as an int array.
397      * @param probabilityInfo the probability information of the word.
398      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
399      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
400      * @param isBlacklistEntry true if this is a blacklisted word, false otherwise
401      */
add(final int[] word, final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord, final boolean isBlacklistEntry)402     private void add(final int[] word, final ProbabilityInfo probabilityInfo,
403             final ArrayList<WeightedString> shortcutTargets,
404             final boolean isNotAWord, final boolean isBlacklistEntry) {
405         assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
406         if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
407             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
408             return;
409         }
410 
411         PtNodeArray currentNodeArray = mRootNodeArray;
412         int charIndex = 0;
413 
414         PtNode currentPtNode = null;
415         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
416         int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]);
417         while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) {
418             currentPtNode = currentNodeArray.mData.get(nodeIndex);
419             differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex);
420             if (ARRAYS_ARE_EQUAL != differentCharIndex
421                     && differentCharIndex < currentPtNode.mChars.length) break;
422             if (null == currentPtNode.mChildren) break;
423             charIndex += currentPtNode.mChars.length;
424             if (charIndex >= word.length) break;
425             currentNodeArray = currentPtNode.mChildren;
426             nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]);
427         }
428 
429         if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) {
430             // No node at this point to accept the word. Create one.
431             final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
432             final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
433                     shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord,
434                     isBlacklistEntry);
435             currentNodeArray.mData.add(insertionIndex, newPtNode);
436             if (DBG) checkStack(currentNodeArray);
437         } else {
438             // There is a word with a common prefix.
439             if (differentCharIndex == currentPtNode.mChars.length) {
440                 if (charIndex + differentCharIndex >= word.length) {
441                     // The new word is a prefix of an existing word, but the node on which it
442                     // should end already exists as is. Since the old PtNode was not a terminal,
443                     // make it one by filling in its frequency and other attributes
444                     currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord,
445                             isBlacklistEntry);
446                 } else {
447                     // The new word matches the full old word and extends past it.
448                     // We only have to create a new node and add it to the end of this.
449                     final PtNode newNode = new PtNode(
450                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
451                                     shortcutTargets, null /* bigrams */, probabilityInfo,
452                                     isNotAWord, isBlacklistEntry);
453                     currentPtNode.mChildren = new PtNodeArray();
454                     currentPtNode.mChildren.mData.add(newNode);
455                 }
456             } else {
457                 if (0 == differentCharIndex) {
458                     // Exact same word. Update the frequency if higher. This will also add the
459                     // new shortcuts to the existing shortcut list if it already exists.
460                     currentPtNode.update(probabilityInfo, shortcutTargets, null,
461                             currentPtNode.mIsNotAWord && isNotAWord,
462                             currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
463                 } else {
464                     // Partial prefix match only. We have to replace the current node with a node
465                     // containing the current prefix and create two new ones for the tails.
466                     PtNodeArray newChildren = new PtNodeArray();
467                     final PtNode newOldWord = new PtNode(
468                             Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
469                                     currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
470                             currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
471                             currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
472                             currentPtNode.mChildren);
473                     newChildren.mData.add(newOldWord);
474 
475                     final PtNode newParent;
476                     if (charIndex + differentCharIndex >= word.length) {
477                         newParent = new PtNode(
478                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
479                                 shortcutTargets, null /* bigrams */, probabilityInfo,
480                                 isNotAWord, isBlacklistEntry, newChildren);
481                     } else {
482                         newParent = new PtNode(
483                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
484                                 null /* shortcutTargets */, null /* bigrams */,
485                                 null /* probabilityInfo */, false /* isNotAWord */,
486                                 false /* isBlacklistEntry */, newChildren);
487                         final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
488                                 charIndex + differentCharIndex, word.length),
489                                 shortcutTargets, null /* bigrams */, probabilityInfo,
490                                 isNotAWord, isBlacklistEntry);
491                         final int addIndex = word[charIndex + differentCharIndex]
492                                 > currentPtNode.mChars[differentCharIndex] ? 1 : 0;
493                         newChildren.mData.add(addIndex, newWord);
494                     }
495                     currentNodeArray.mData.set(nodeIndex, newParent);
496                 }
497                 if (DBG) checkStack(currentNodeArray);
498             }
499         }
500     }
501 
502     private static int ARRAYS_ARE_EQUAL = 0;
503 
504     /**
505      * Custom comparison of two int arrays taken to contain character codes.
506      *
507      * This method compares the two arrays passed as an argument in a lexicographic way,
508      * with an offset in the dst string.
509      * This method does NOT test for the first character. It is taken to be equal.
510      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
511      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
512      * strings are equal. This works BECAUSE we don't look at the first character.
513      *
514      * @param src the left-hand side string of the comparison.
515      * @param dst the right-hand side string of the comparison.
516      * @param dstOffset the offset in the right-hand side string.
517      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
518      */
compareCharArrays(final int[] src, final int[] dst, int dstOffset)519     private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) {
520         // We do NOT test the first char, because we come from a method that already
521         // tested it.
522         for (int i = 1; i < src.length; ++i) {
523             if (dstOffset + i >= dst.length) return i;
524             if (src[i] != dst[dstOffset + i]) return i;
525         }
526         if (dst.length > src.length) return src.length;
527         return ARRAYS_ARE_EQUAL;
528     }
529 
530     /**
531      * Helper class that compares and sorts two PtNodes according to their
532      * first element only. I repeat: ONLY the first element is considered, the rest
533      * is ignored.
534      * This comparator imposes orderings that are inconsistent with equals.
535      */
536     static private final class PtNodeComparator implements java.util.Comparator<PtNode> {
537         @Override
compare(PtNode p1, PtNode p2)538         public int compare(PtNode p1, PtNode p2) {
539             if (p1.mChars[0] == p2.mChars[0]) return 0;
540             return p1.mChars[0] < p2.mChars[0] ? -1 : 1;
541         }
542     }
543     final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator();
544 
545     /**
546      * Finds the insertion index of a character within a node array.
547      */
findInsertionIndex(final PtNodeArray nodeArray, int character)548     private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
549         final ArrayList<PtNode> data = nodeArray.mData;
550         final PtNode reference = new PtNode(new int[] { character },
551                 null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */,
552                 false /* isNotAWord */, false /* isBlacklistEntry */);
553         int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
554         return result >= 0 ? result : -result - 1;
555     }
556 
557     /**
558      * Find the index of a char in a node array, if it exists.
559      *
560      * @param nodeArray the node array to search in.
561      * @param character the character to search for.
562      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else.
563      */
findIndexOfChar(final PtNodeArray nodeArray, int character)564     private static int findIndexOfChar(final PtNodeArray nodeArray, int character) {
565         final int insertionIndex = findInsertionIndex(nodeArray, character);
566         if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX;
567         return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex
568                 : CHARACTER_NOT_FOUND_INDEX;
569     }
570 
571     /**
572      * Helper method to find a word in a given branch.
573      */
findWordInTree(PtNodeArray nodeArray, final String string)574     public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) {
575         int index = 0;
576         final StringBuilder checker = DBG ? new StringBuilder() : null;
577         final int[] codePoints = getCodePoints(string);
578 
579         PtNode currentPtNode;
580         do {
581             int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]);
582             if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null;
583             currentPtNode = nodeArray.mData.get(indexOfGroup);
584 
585             if (codePoints.length - index < currentPtNode.mChars.length) return null;
586             int newIndex = index;
587             while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) {
588                 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null;
589                 newIndex++;
590             }
591             index = newIndex;
592 
593             if (DBG) {
594                 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length));
595             }
596             if (index < codePoints.length) {
597                 nodeArray = currentPtNode.mChildren;
598             }
599         } while (null != nodeArray && index < codePoints.length);
600 
601         if (index < codePoints.length) return null;
602         if (!currentPtNode.isTerminal()) return null;
603         if (DBG && !string.equals(checker.toString())) return null;
604         return currentPtNode;
605     }
606 
607     /**
608      * Helper method to find out whether a word is in the dict or not.
609      */
hasWord(final String s)610     public boolean hasWord(final String s) {
611         if (null == s || "".equals(s)) {
612             throw new RuntimeException("Can't search for a null or empty string");
613         }
614         return null != findWordInTree(mRootNodeArray, s);
615     }
616 
617     /**
618      * Recursively count the number of PtNodes in a given branch of the trie.
619      *
620      * @param nodeArray the parent node.
621      * @return the number of PtNodes in all the branch under this node.
622      */
countPtNodes(final PtNodeArray nodeArray)623     public static int countPtNodes(final PtNodeArray nodeArray) {
624         final int nodeSize = nodeArray.mData.size();
625         int size = nodeSize;
626         for (int i = nodeSize - 1; i >= 0; --i) {
627             PtNode ptNode = nodeArray.mData.get(i);
628             if (null != ptNode.mChildren)
629                 size += countPtNodes(ptNode.mChildren);
630         }
631         return size;
632     }
633 
634     /**
635      * Iterator to walk through a dictionary.
636      *
637      * This is purely for convenience.
638      */
639     public static final class DictionaryIterator implements Iterator<WordProperty> {
640         private static final class Position {
641             public Iterator<PtNode> pos;
642             public int length;
Position(ArrayList<PtNode> ptNodes)643             public Position(ArrayList<PtNode> ptNodes) {
644                 pos = ptNodes.iterator();
645                 length = 0;
646             }
647         }
648         final StringBuilder mCurrentString;
649         final LinkedList<Position> mPositions;
650 
DictionaryIterator(ArrayList<PtNode> ptRoot)651         public DictionaryIterator(ArrayList<PtNode> ptRoot) {
652             mCurrentString = new StringBuilder();
653             mPositions = new LinkedList<>();
654             final Position rootPos = new Position(ptRoot);
655             mPositions.add(rootPos);
656         }
657 
658         @Override
hasNext()659         public boolean hasNext() {
660             for (Position p : mPositions) {
661                 if (p.pos.hasNext()) {
662                     return true;
663                 }
664             }
665             return false;
666         }
667 
668         @Override
next()669         public WordProperty next() {
670             Position currentPos = mPositions.getLast();
671             mCurrentString.setLength(currentPos.length);
672 
673             do {
674                 if (currentPos.pos.hasNext()) {
675                     final PtNode currentPtNode = currentPos.pos.next();
676                     currentPos.length = mCurrentString.length();
677                     for (int i : currentPtNode.mChars) {
678                         mCurrentString.append(Character.toChars(i));
679                     }
680                     if (null != currentPtNode.mChildren) {
681                         currentPos = new Position(currentPtNode.mChildren.mData);
682                         currentPos.length = mCurrentString.length();
683                         mPositions.addLast(currentPos);
684                     }
685                     if (currentPtNode.isTerminal()) {
686                         return new WordProperty(mCurrentString.toString(),
687                                 currentPtNode.mProbabilityInfo,
688                                 currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
689                                 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
690                     }
691                 } else {
692                     mPositions.removeLast();
693                     currentPos = mPositions.getLast();
694                     mCurrentString.setLength(mPositions.getLast().length);
695                 }
696             } while (true);
697         }
698 
699         @Override
remove()700         public void remove() {
701             throw new UnsupportedOperationException("Unsupported yet");
702         }
703 
704     }
705 
706     /**
707      * Method to return an iterator.
708      *
709      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
710      * and say : for (Word w : x) {}
711      */
712     @Override
iterator()713     public Iterator<WordProperty> iterator() {
714         return new DictionaryIterator(mRootNodeArray.mData);
715     }
716 }
717