1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; 22 23 import java.util.ArrayList; 24 import java.util.Arrays; 25 import java.util.Collections; 26 import java.util.Iterator; 27 import java.util.LinkedList; 28 29 /** 30 * A dictionary that can fusion heads and tails of words for more compression. 31 */ 32 @UsedForTesting 33 public final class FusionDictionary implements Iterable<WordProperty> { 34 private static final boolean DBG = MakedictLog.DBG; 35 36 private static int CHARACTER_NOT_FOUND_INDEX = -1; 37 38 /** 39 * A node array of the dictionary, containing several PtNodes. 40 * 41 * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the 42 * real information. 43 * This class also contains fields to cache size and address, to help with binary 44 * generation. 45 */ 46 public static final class PtNodeArray { 47 ArrayList<PtNode> mData; 48 // To help with binary generation 49 int mCachedSize = Integer.MIN_VALUE; 50 // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 51 // always hold the same value except between dictionary address compression, during which 52 // the update process needs to know about both values at the same time. Updating will 53 // update the AfterUpdate value, and the code will move them to BeforeUpdate before 54 // the next update pass. 55 int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 56 int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 57 int mCachedParentAddress = 0; 58 PtNodeArray()59 public PtNodeArray() { 60 mData = new ArrayList<>(); 61 } PtNodeArray(ArrayList<PtNode> data)62 public PtNodeArray(ArrayList<PtNode> data) { 63 Collections.sort(data, PTNODE_COMPARATOR); 64 mData = data; 65 } 66 } 67 68 /** 69 * PtNode is a group of characters, with probability information, shortcut targets, bigrams, 70 * and children (Pt means Patricia Trie). 71 * 72 * This is the central class of the in-memory representation. A PtNode is what can 73 * be seen as a traditional "trie node", except it can hold several characters at the 74 * same time. A PtNode essentially represents one or several characters in the middle 75 * of the trie tree; as such, it can be a terminal, and it can have children. 76 * In this in-memory representation, whether the PtNode is a terminal or not is represented 77 * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the 78 * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null 79 * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present, 80 * are non-null. 81 */ 82 public static final class PtNode { 83 private static final int NOT_A_TERMINAL = -1; 84 final int mChars[]; 85 ArrayList<WeightedString> mShortcutTargets; 86 ArrayList<WeightedString> mBigrams; 87 // null == mProbabilityInfo indicates this is not a terminal. 88 ProbabilityInfo mProbabilityInfo; 89 int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. 90 PtNodeArray mChildren; 91 boolean mIsNotAWord; // Only a shortcut 92 boolean mIsBlacklistEntry; 93 // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary 94 // generation. Before and After always hold the same value except during dictionary 95 // address compression, where the update process needs to know about both values at the 96 // same time. Updating will update the AfterUpdate value, and the code will move them 97 // to BeforeUpdate before the next update pass. 98 // The update process does not need two versions of mCachedSize. 99 int mCachedSize; // The size, in bytes, of this PtNode. 100 int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) 101 int mCachedAddressAfterUpdate; // The address of this PtNode (after update) 102 PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isBlacklistEntry)103 public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 104 final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, 105 final boolean isNotAWord, final boolean isBlacklistEntry) { 106 mChars = chars; 107 mProbabilityInfo = probabilityInfo; 108 mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability; 109 mShortcutTargets = shortcutTargets; 110 mBigrams = bigrams; 111 mChildren = null; 112 mIsNotAWord = isNotAWord; 113 mIsBlacklistEntry = isBlacklistEntry; 114 } 115 PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isBlacklistEntry, final PtNodeArray children)116 public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 117 final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, 118 final boolean isNotAWord, final boolean isBlacklistEntry, 119 final PtNodeArray children) { 120 mChars = chars; 121 mProbabilityInfo = probabilityInfo; 122 mShortcutTargets = shortcutTargets; 123 mBigrams = bigrams; 124 mChildren = children; 125 mIsNotAWord = isNotAWord; 126 mIsBlacklistEntry = isBlacklistEntry; 127 } 128 addChild(PtNode n)129 public void addChild(PtNode n) { 130 if (null == mChildren) { 131 mChildren = new PtNodeArray(); 132 } 133 mChildren.mData.add(n); 134 } 135 getTerminalId()136 public int getTerminalId() { 137 return mTerminalId; 138 } 139 isTerminal()140 public boolean isTerminal() { 141 return mProbabilityInfo != null; 142 } 143 getProbability()144 public int getProbability() { 145 if (isTerminal()) { 146 return mProbabilityInfo.mProbability; 147 } else { 148 return NOT_A_TERMINAL; 149 } 150 } 151 getIsNotAWord()152 public boolean getIsNotAWord() { 153 return mIsNotAWord; 154 } 155 getIsBlacklistEntry()156 public boolean getIsBlacklistEntry() { 157 return mIsBlacklistEntry; 158 } 159 getShortcutTargets()160 public ArrayList<WeightedString> getShortcutTargets() { 161 // We don't want write permission to escape outside the package, so we return a copy 162 if (null == mShortcutTargets) return null; 163 final ArrayList<WeightedString> copyOfShortcutTargets = 164 new ArrayList<>(mShortcutTargets); 165 return copyOfShortcutTargets; 166 } 167 getBigrams()168 public ArrayList<WeightedString> getBigrams() { 169 // We don't want write permission to escape outside the package, so we return a copy 170 if (null == mBigrams) return null; 171 final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams); 172 return copyOfBigrams; 173 } 174 hasSeveralChars()175 public boolean hasSeveralChars() { 176 assert(mChars.length > 0); 177 return 1 < mChars.length; 178 } 179 180 /** 181 * Adds a word to the bigram list. Updates the probability information if the word already 182 * exists. 183 */ addBigram(final String word, final ProbabilityInfo probabilityInfo)184 public void addBigram(final String word, final ProbabilityInfo probabilityInfo) { 185 if (mBigrams == null) { 186 mBigrams = new ArrayList<>(); 187 } 188 WeightedString bigram = getBigram(word); 189 if (bigram != null) { 190 bigram.mProbabilityInfo = probabilityInfo; 191 } else { 192 bigram = new WeightedString(word, probabilityInfo); 193 mBigrams.add(bigram); 194 } 195 } 196 197 /** 198 * Gets the shortcut target for the given word. Returns null if the word is not in the 199 * shortcut list. 200 */ getShortcut(final String word)201 public WeightedString getShortcut(final String word) { 202 // TODO: Don't do a linear search 203 if (mShortcutTargets != null) { 204 final int size = mShortcutTargets.size(); 205 for (int i = 0; i < size; ++i) { 206 WeightedString shortcut = mShortcutTargets.get(i); 207 if (shortcut.mWord.equals(word)) { 208 return shortcut; 209 } 210 } 211 } 212 return null; 213 } 214 215 /** 216 * Gets the bigram for the given word. 217 * Returns null if the word is not in the bigrams list. 218 */ getBigram(final String word)219 public WeightedString getBigram(final String word) { 220 // TODO: Don't do a linear search 221 if (mBigrams != null) { 222 final int size = mBigrams.size(); 223 for (int i = 0; i < size; ++i) { 224 WeightedString bigram = mBigrams.get(i); 225 if (bigram.mWord.equals(word)) { 226 return bigram; 227 } 228 } 229 } 230 return null; 231 } 232 233 /** 234 * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to 235 * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 236 * updated if they are higher than the existing ones. 237 */ update(final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final boolean isNotAWord, final boolean isBlacklistEntry)238 private void update(final ProbabilityInfo probabilityInfo, 239 final ArrayList<WeightedString> shortcutTargets, 240 final ArrayList<WeightedString> bigrams, 241 final boolean isNotAWord, final boolean isBlacklistEntry) { 242 mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo); 243 if (shortcutTargets != null) { 244 if (mShortcutTargets == null) { 245 mShortcutTargets = shortcutTargets; 246 } else { 247 final int size = shortcutTargets.size(); 248 for (int i = 0; i < size; ++i) { 249 final WeightedString shortcut = shortcutTargets.get(i); 250 final WeightedString existingShortcut = getShortcut(shortcut.mWord); 251 if (existingShortcut == null) { 252 mShortcutTargets.add(shortcut); 253 } else { 254 existingShortcut.mProbabilityInfo = ProbabilityInfo.max( 255 existingShortcut.mProbabilityInfo, shortcut.mProbabilityInfo); 256 } 257 } 258 } 259 } 260 if (bigrams != null) { 261 if (mBigrams == null) { 262 mBigrams = bigrams; 263 } else { 264 final int size = bigrams.size(); 265 for (int i = 0; i < size; ++i) { 266 final WeightedString bigram = bigrams.get(i); 267 final WeightedString existingBigram = getBigram(bigram.mWord); 268 if (existingBigram == null) { 269 mBigrams.add(bigram); 270 } else { 271 existingBigram.mProbabilityInfo = ProbabilityInfo.max( 272 existingBigram.mProbabilityInfo, bigram.mProbabilityInfo); 273 } 274 } 275 } 276 } 277 mIsNotAWord = isNotAWord; 278 mIsBlacklistEntry = isBlacklistEntry; 279 } 280 } 281 282 public final DictionaryOptions mOptions; 283 public final PtNodeArray mRootNodeArray; 284 FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options)285 public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { 286 mRootNodeArray = rootNodeArray; 287 mOptions = options; 288 } 289 addOptionAttribute(final String key, final String value)290 public void addOptionAttribute(final String key, final String value) { 291 mOptions.mAttributes.put(key, value); 292 } 293 294 /** 295 * Helper method to convert a String to an int array. 296 */ getCodePoints(final String word)297 static int[] getCodePoints(final String word) { 298 // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, 299 // which is not visible from the makedict package. Factor this code. 300 final int length = word.length(); 301 if (length <= 0) return new int[] {}; 302 final char[] characters = word.toCharArray(); 303 final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 304 int codePoint = Character.codePointAt(characters, 0); 305 int dsti = 0; 306 for (int srci = Character.charCount(codePoint); 307 srci < length; srci += Character.charCount(codePoint), ++dsti) { 308 codePoints[dsti] = codePoint; 309 codePoint = Character.codePointAt(characters, srci); 310 } 311 codePoints[dsti] = codePoint; 312 return codePoints; 313 } 314 315 /** 316 * Helper method to add a word as a string. 317 * 318 * This method adds a word to the dictionary with the given frequency. Optional 319 * lists of bigrams and shortcuts can be passed here. For each word inside, 320 * they will be added to the dictionary as necessary. 321 * 322 * @param word the word to add. 323 * @param probabilityInfo probability information of the word. 324 * @param shortcutTargets a list of shortcut targets for this word, or null. 325 * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 326 */ add(final String word, final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)327 public void add(final String word, final ProbabilityInfo probabilityInfo, 328 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 329 add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord, 330 false /* isBlacklistEntry */); 331 } 332 333 /** 334 * Helper method to add a blacklist entry as a string. 335 * 336 * @param word the word to add as a blacklist entry. 337 * @param shortcutTargets a list of shortcut targets for this word, or null. 338 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 339 */ addBlacklistEntry(final String word, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)340 public void addBlacklistEntry(final String word, 341 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 342 add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord, 343 true /* isBlacklistEntry */); 344 } 345 346 /** 347 * Sanity check for a PtNode array. 348 * 349 * This method checks that all PtNodes in a node array are ordered as expected. 350 * If they are, nothing happens. If they aren't, an exception is thrown. 351 */ checkStack(PtNodeArray ptNodeArray)352 private void checkStack(PtNodeArray ptNodeArray) { 353 ArrayList<PtNode> stack = ptNodeArray.mData; 354 int lastValue = -1; 355 for (int i = 0; i < stack.size(); ++i) { 356 int currentValue = stack.get(i).mChars[0]; 357 if (currentValue <= lastValue) 358 throw new RuntimeException("Invalid stack"); 359 else 360 lastValue = currentValue; 361 } 362 } 363 364 /** 365 * Helper method to add a new bigram to the dictionary. 366 * 367 * @param word0 the previous word of the context 368 * @param word1 the next word of the context 369 * @param probabilityInfo the bigram probability info 370 */ setBigram(final String word0, final String word1, final ProbabilityInfo probabilityInfo)371 public void setBigram(final String word0, final String word1, 372 final ProbabilityInfo probabilityInfo) { 373 PtNode ptNode0 = findWordInTree(mRootNodeArray, word0); 374 if (ptNode0 != null) { 375 final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1); 376 if (ptNode1 == null) { 377 add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */, 378 false /* isBlacklistEntry */); 379 // The PtNode for the first word may have moved by the above insertion, 380 // if word1 and word2 share a common stem that happens not to have been 381 // a cutting point until now. In this case, we need to refresh ptNode. 382 ptNode0 = findWordInTree(mRootNodeArray, word0); 383 } 384 ptNode0.addBigram(word1, probabilityInfo); 385 } else { 386 throw new RuntimeException("First word of bigram not found " + word0); 387 } 388 } 389 390 /** 391 * Add a word to this dictionary. 392 * 393 * The shortcuts, if any, have to be in the dictionary already. If they aren't, 394 * an exception is thrown. 395 * 396 * @param word the word, as an int array. 397 * @param probabilityInfo the probability information of the word. 398 * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 399 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 400 * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 401 */ add(final int[] word, final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord, final boolean isBlacklistEntry)402 private void add(final int[] word, final ProbabilityInfo probabilityInfo, 403 final ArrayList<WeightedString> shortcutTargets, 404 final boolean isNotAWord, final boolean isBlacklistEntry) { 405 assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY); 406 if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { 407 MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 408 return; 409 } 410 411 PtNodeArray currentNodeArray = mRootNodeArray; 412 int charIndex = 0; 413 414 PtNode currentPtNode = null; 415 int differentCharIndex = 0; // Set by the loop to the index of the char that differs 416 int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); 417 while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { 418 currentPtNode = currentNodeArray.mData.get(nodeIndex); 419 differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); 420 if (ARRAYS_ARE_EQUAL != differentCharIndex 421 && differentCharIndex < currentPtNode.mChars.length) break; 422 if (null == currentPtNode.mChildren) break; 423 charIndex += currentPtNode.mChars.length; 424 if (charIndex >= word.length) break; 425 currentNodeArray = currentPtNode.mChildren; 426 nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); 427 } 428 429 if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { 430 // No node at this point to accept the word. Create one. 431 final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); 432 final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), 433 shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord, 434 isBlacklistEntry); 435 currentNodeArray.mData.add(insertionIndex, newPtNode); 436 if (DBG) checkStack(currentNodeArray); 437 } else { 438 // There is a word with a common prefix. 439 if (differentCharIndex == currentPtNode.mChars.length) { 440 if (charIndex + differentCharIndex >= word.length) { 441 // The new word is a prefix of an existing word, but the node on which it 442 // should end already exists as is. Since the old PtNode was not a terminal, 443 // make it one by filling in its frequency and other attributes 444 currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord, 445 isBlacklistEntry); 446 } else { 447 // The new word matches the full old word and extends past it. 448 // We only have to create a new node and add it to the end of this. 449 final PtNode newNode = new PtNode( 450 Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 451 shortcutTargets, null /* bigrams */, probabilityInfo, 452 isNotAWord, isBlacklistEntry); 453 currentPtNode.mChildren = new PtNodeArray(); 454 currentPtNode.mChildren.mData.add(newNode); 455 } 456 } else { 457 if (0 == differentCharIndex) { 458 // Exact same word. Update the frequency if higher. This will also add the 459 // new shortcuts to the existing shortcut list if it already exists. 460 currentPtNode.update(probabilityInfo, shortcutTargets, null, 461 currentPtNode.mIsNotAWord && isNotAWord, 462 currentPtNode.mIsBlacklistEntry || isBlacklistEntry); 463 } else { 464 // Partial prefix match only. We have to replace the current node with a node 465 // containing the current prefix and create two new ones for the tails. 466 PtNodeArray newChildren = new PtNodeArray(); 467 final PtNode newOldWord = new PtNode( 468 Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, 469 currentPtNode.mChars.length), currentPtNode.mShortcutTargets, 470 currentPtNode.mBigrams, currentPtNode.mProbabilityInfo, 471 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry, 472 currentPtNode.mChildren); 473 newChildren.mData.add(newOldWord); 474 475 final PtNode newParent; 476 if (charIndex + differentCharIndex >= word.length) { 477 newParent = new PtNode( 478 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 479 shortcutTargets, null /* bigrams */, probabilityInfo, 480 isNotAWord, isBlacklistEntry, newChildren); 481 } else { 482 newParent = new PtNode( 483 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 484 null /* shortcutTargets */, null /* bigrams */, 485 null /* probabilityInfo */, false /* isNotAWord */, 486 false /* isBlacklistEntry */, newChildren); 487 final PtNode newWord = new PtNode(Arrays.copyOfRange(word, 488 charIndex + differentCharIndex, word.length), 489 shortcutTargets, null /* bigrams */, probabilityInfo, 490 isNotAWord, isBlacklistEntry); 491 final int addIndex = word[charIndex + differentCharIndex] 492 > currentPtNode.mChars[differentCharIndex] ? 1 : 0; 493 newChildren.mData.add(addIndex, newWord); 494 } 495 currentNodeArray.mData.set(nodeIndex, newParent); 496 } 497 if (DBG) checkStack(currentNodeArray); 498 } 499 } 500 } 501 502 private static int ARRAYS_ARE_EQUAL = 0; 503 504 /** 505 * Custom comparison of two int arrays taken to contain character codes. 506 * 507 * This method compares the two arrays passed as an argument in a lexicographic way, 508 * with an offset in the dst string. 509 * This method does NOT test for the first character. It is taken to be equal. 510 * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 511 * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 512 * strings are equal. This works BECAUSE we don't look at the first character. 513 * 514 * @param src the left-hand side string of the comparison. 515 * @param dst the right-hand side string of the comparison. 516 * @param dstOffset the offset in the right-hand side string. 517 * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 518 */ compareCharArrays(final int[] src, final int[] dst, int dstOffset)519 private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { 520 // We do NOT test the first char, because we come from a method that already 521 // tested it. 522 for (int i = 1; i < src.length; ++i) { 523 if (dstOffset + i >= dst.length) return i; 524 if (src[i] != dst[dstOffset + i]) return i; 525 } 526 if (dst.length > src.length) return src.length; 527 return ARRAYS_ARE_EQUAL; 528 } 529 530 /** 531 * Helper class that compares and sorts two PtNodes according to their 532 * first element only. I repeat: ONLY the first element is considered, the rest 533 * is ignored. 534 * This comparator imposes orderings that are inconsistent with equals. 535 */ 536 static private final class PtNodeComparator implements java.util.Comparator<PtNode> { 537 @Override compare(PtNode p1, PtNode p2)538 public int compare(PtNode p1, PtNode p2) { 539 if (p1.mChars[0] == p2.mChars[0]) return 0; 540 return p1.mChars[0] < p2.mChars[0] ? -1 : 1; 541 } 542 } 543 final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); 544 545 /** 546 * Finds the insertion index of a character within a node array. 547 */ findInsertionIndex(final PtNodeArray nodeArray, int character)548 private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { 549 final ArrayList<PtNode> data = nodeArray.mData; 550 final PtNode reference = new PtNode(new int[] { character }, 551 null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */, 552 false /* isNotAWord */, false /* isBlacklistEntry */); 553 int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); 554 return result >= 0 ? result : -result - 1; 555 } 556 557 /** 558 * Find the index of a char in a node array, if it exists. 559 * 560 * @param nodeArray the node array to search in. 561 * @param character the character to search for. 562 * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. 563 */ findIndexOfChar(final PtNodeArray nodeArray, int character)564 private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { 565 final int insertionIndex = findInsertionIndex(nodeArray, character); 566 if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; 567 return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex 568 : CHARACTER_NOT_FOUND_INDEX; 569 } 570 571 /** 572 * Helper method to find a word in a given branch. 573 */ findWordInTree(PtNodeArray nodeArray, final String string)574 public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) { 575 int index = 0; 576 final StringBuilder checker = DBG ? new StringBuilder() : null; 577 final int[] codePoints = getCodePoints(string); 578 579 PtNode currentPtNode; 580 do { 581 int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); 582 if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; 583 currentPtNode = nodeArray.mData.get(indexOfGroup); 584 585 if (codePoints.length - index < currentPtNode.mChars.length) return null; 586 int newIndex = index; 587 while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { 588 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; 589 newIndex++; 590 } 591 index = newIndex; 592 593 if (DBG) { 594 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); 595 } 596 if (index < codePoints.length) { 597 nodeArray = currentPtNode.mChildren; 598 } 599 } while (null != nodeArray && index < codePoints.length); 600 601 if (index < codePoints.length) return null; 602 if (!currentPtNode.isTerminal()) return null; 603 if (DBG && !string.equals(checker.toString())) return null; 604 return currentPtNode; 605 } 606 607 /** 608 * Helper method to find out whether a word is in the dict or not. 609 */ hasWord(final String s)610 public boolean hasWord(final String s) { 611 if (null == s || "".equals(s)) { 612 throw new RuntimeException("Can't search for a null or empty string"); 613 } 614 return null != findWordInTree(mRootNodeArray, s); 615 } 616 617 /** 618 * Recursively count the number of PtNodes in a given branch of the trie. 619 * 620 * @param nodeArray the parent node. 621 * @return the number of PtNodes in all the branch under this node. 622 */ countPtNodes(final PtNodeArray nodeArray)623 public static int countPtNodes(final PtNodeArray nodeArray) { 624 final int nodeSize = nodeArray.mData.size(); 625 int size = nodeSize; 626 for (int i = nodeSize - 1; i >= 0; --i) { 627 PtNode ptNode = nodeArray.mData.get(i); 628 if (null != ptNode.mChildren) 629 size += countPtNodes(ptNode.mChildren); 630 } 631 return size; 632 } 633 634 /** 635 * Iterator to walk through a dictionary. 636 * 637 * This is purely for convenience. 638 */ 639 public static final class DictionaryIterator implements Iterator<WordProperty> { 640 private static final class Position { 641 public Iterator<PtNode> pos; 642 public int length; Position(ArrayList<PtNode> ptNodes)643 public Position(ArrayList<PtNode> ptNodes) { 644 pos = ptNodes.iterator(); 645 length = 0; 646 } 647 } 648 final StringBuilder mCurrentString; 649 final LinkedList<Position> mPositions; 650 DictionaryIterator(ArrayList<PtNode> ptRoot)651 public DictionaryIterator(ArrayList<PtNode> ptRoot) { 652 mCurrentString = new StringBuilder(); 653 mPositions = new LinkedList<>(); 654 final Position rootPos = new Position(ptRoot); 655 mPositions.add(rootPos); 656 } 657 658 @Override hasNext()659 public boolean hasNext() { 660 for (Position p : mPositions) { 661 if (p.pos.hasNext()) { 662 return true; 663 } 664 } 665 return false; 666 } 667 668 @Override next()669 public WordProperty next() { 670 Position currentPos = mPositions.getLast(); 671 mCurrentString.setLength(currentPos.length); 672 673 do { 674 if (currentPos.pos.hasNext()) { 675 final PtNode currentPtNode = currentPos.pos.next(); 676 currentPos.length = mCurrentString.length(); 677 for (int i : currentPtNode.mChars) { 678 mCurrentString.append(Character.toChars(i)); 679 } 680 if (null != currentPtNode.mChildren) { 681 currentPos = new Position(currentPtNode.mChildren.mData); 682 currentPos.length = mCurrentString.length(); 683 mPositions.addLast(currentPos); 684 } 685 if (currentPtNode.isTerminal()) { 686 return new WordProperty(mCurrentString.toString(), 687 currentPtNode.mProbabilityInfo, 688 currentPtNode.mShortcutTargets, currentPtNode.mBigrams, 689 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry); 690 } 691 } else { 692 mPositions.removeLast(); 693 currentPos = mPositions.getLast(); 694 mCurrentString.setLength(mPositions.getLast().length); 695 } 696 } while (true); 697 } 698 699 @Override remove()700 public void remove() { 701 throw new UnsupportedOperationException("Unsupported yet"); 702 } 703 704 } 705 706 /** 707 * Method to return an iterator. 708 * 709 * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 710 * and say : for (Word w : x) {} 711 */ 712 @Override iterator()713 public Iterator<WordProperty> iterator() { 714 return new DictionaryIterator(mRootNodeArray.mData); 715 } 716 } 717