1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.Constants; 21 import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; 22 23 import java.io.File; 24 import java.io.IOException; 25 import java.io.OutputStream; 26 import java.util.ArrayList; 27 import java.util.Map; 28 import java.util.Stack; 29 30 public final class BinaryDictIOUtils { 31 private static final boolean DBG = false; 32 BinaryDictIOUtils()33 private BinaryDictIOUtils() { 34 // This utility class is not publicly instantiable. 35 } 36 37 /** 38 * Returns new dictionary decoder. 39 * 40 * @param dictFile the dictionary file. 41 * @param bufferType The type of buffer, as one of USE_* in DictDecoder. 42 * @return new dictionary decoder if the dictionary file exists, otherwise null. 43 */ getDictDecoder(final File dictFile, final long offset, final long length, final int bufferType)44 public static DictDecoder getDictDecoder(final File dictFile, final long offset, 45 final long length, final int bufferType) { 46 if (dictFile.isDirectory()) { 47 return new Ver4DictDecoder(dictFile, bufferType); 48 } else if (dictFile.isFile()) { 49 return new Ver2DictDecoder(dictFile, offset, length, bufferType); 50 } 51 return null; 52 } 53 getDictDecoder(final File dictFile, final long offset, final long length, final DictionaryBufferFactory factory)54 public static DictDecoder getDictDecoder(final File dictFile, final long offset, 55 final long length, final DictionaryBufferFactory factory) { 56 if (dictFile.isDirectory()) { 57 return new Ver4DictDecoder(dictFile, factory); 58 } else if (dictFile.isFile()) { 59 return new Ver2DictDecoder(dictFile, offset, length, factory); 60 } 61 return null; 62 } 63 getDictDecoder(final File dictFile, final long offset, final long length)64 public static DictDecoder getDictDecoder(final File dictFile, final long offset, 65 final long length) { 66 return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER); 67 } 68 69 private static final class Position { 70 public static final int NOT_READ_PTNODE_COUNT = -1; 71 72 public int mAddress; 73 public int mNumOfPtNode; 74 public int mPosition; 75 public int mLength; 76 Position(int address, int length)77 public Position(int address, int length) { 78 mAddress = address; 79 mLength = length; 80 mNumOfPtNode = NOT_READ_PTNODE_COUNT; 81 } 82 } 83 84 /** 85 * Retrieves all node arrays without recursive call. 86 */ readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, final int bodyOffset, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams)87 private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, 88 final int bodyOffset, final Map<Integer, String> words, 89 final Map<Integer, Integer> frequencies, 90 final Map<Integer, ArrayList<PendingAttribute>> bigrams) { 91 int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; 92 93 Stack<Position> stack = new Stack<>(); 94 int index = 0; 95 96 Position initPos = new Position(bodyOffset, 0); 97 stack.push(initPos); 98 99 while (!stack.empty()) { 100 Position p = stack.peek(); 101 102 if (DBG) { 103 MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" + 104 p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength); 105 } 106 107 if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress); 108 if (index != p.mLength) index = p.mLength; 109 110 if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) { 111 p.mNumOfPtNode = dictDecoder.readPtNodeCount(); 112 p.mAddress = dictDecoder.getPosition(); 113 p.mPosition = 0; 114 } 115 if (p.mNumOfPtNode == 0) { 116 stack.pop(); 117 continue; 118 } 119 final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress); 120 for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) { 121 pushedChars[index++] = ptNodeInfo.mCharacters[i]; 122 } 123 p.mPosition++; 124 if (ptNodeInfo.isTerminal()) {// found word 125 words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index)); 126 frequencies.put( 127 ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability); 128 if (ptNodeInfo.mBigrams != null) { 129 bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams); 130 } 131 } 132 133 if (p.mPosition == p.mNumOfPtNode) { 134 stack.pop(); 135 } else { 136 // The PtNode array has more PtNodes. 137 p.mAddress = dictDecoder.getPosition(); 138 } 139 140 if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) { 141 final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index); 142 stack.push(childrenPos); 143 } 144 } 145 } 146 147 /** 148 * Reads unigrams and bigrams from the binary file. 149 * Doesn't store a full memory representation of the dictionary. 150 * 151 * @param dictDecoder the dict decoder. 152 * @param words the map to store the address as a key and the word as a value. 153 * @param frequencies the map to store the address as a key and the frequency as a value. 154 * @param bigrams the map to store the address as a key and the list of address as a value. 155 * @throws IOException if the file can't be read. 156 * @throws UnsupportedFormatException if the format of the file is not recognized. 157 */ readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams)158 /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, 159 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 160 final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 161 UnsupportedFormatException { 162 // Read header 163 final DictionaryHeader header = dictDecoder.readHeader(); 164 readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words, 165 frequencies, bigrams); 166 } 167 168 /** 169 * Gets the address of the last PtNode of the exact matching word in the dictionary. 170 * If no match is found, returns NOT_VALID_WORD. 171 * 172 * @param dictDecoder the dict decoder. 173 * @param word the word we search for. 174 * @return the address of the terminal node. 175 * @throws IOException if the file can't be read. 176 * @throws UnsupportedFormatException if the format of the file is not recognized. 177 */ 178 @UsedForTesting getTerminalPosition(final DictDecoder dictDecoder, final String word)179 /* package */ static int getTerminalPosition(final DictDecoder dictDecoder, 180 final String word) throws IOException, UnsupportedFormatException { 181 if (word == null) return FormatSpec.NOT_VALID_WORD; 182 dictDecoder.setPosition(0); 183 dictDecoder.readHeader(); 184 int wordPos = 0; 185 final int wordLen = word.codePointCount(0, word.length()); 186 for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { 187 if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; 188 189 do { 190 final int ptNodeCount = dictDecoder.readPtNodeCount(); 191 boolean foundNextPtNode = false; 192 for (int i = 0; i < ptNodeCount; ++i) { 193 final int ptNodePos = dictDecoder.getPosition(); 194 final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos); 195 boolean same = true; 196 for (int p = 0, j = word.offsetByCodePoints(0, wordPos); 197 p < currentInfo.mCharacters.length; 198 ++p, j = word.offsetByCodePoints(j, 1)) { 199 if (wordPos + p >= wordLen 200 || word.codePointAt(j) != currentInfo.mCharacters[p]) { 201 same = false; 202 break; 203 } 204 } 205 206 if (same) { 207 // found the PtNode matches the word. 208 if (wordPos + currentInfo.mCharacters.length == wordLen) { 209 if (!currentInfo.isTerminal()) { 210 return FormatSpec.NOT_VALID_WORD; 211 } else { 212 return ptNodePos; 213 } 214 } 215 wordPos += currentInfo.mCharacters.length; 216 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 217 return FormatSpec.NOT_VALID_WORD; 218 } 219 foundNextPtNode = true; 220 dictDecoder.setPosition(currentInfo.mChildrenAddress); 221 break; 222 } 223 } 224 if (foundNextPtNode) break; 225 return FormatSpec.NOT_VALID_WORD; 226 } while(true); 227 } 228 return FormatSpec.NOT_VALID_WORD; 229 } 230 231 /** 232 * Writes a PtNodeCount to the stream. 233 * 234 * @param destination the stream to write. 235 * @param ptNodeCount the count. 236 * @return the size written in bytes. 237 */ 238 @UsedForTesting writePtNodeCount(final OutputStream destination, final int ptNodeCount)239 static int writePtNodeCount(final OutputStream destination, final int ptNodeCount) 240 throws IOException { 241 final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); 242 // the count must fit on one byte or two bytes. 243 // Please see comments in FormatSpec. 244 if (countSize != 1 && countSize != 2) { 245 throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); 246 } 247 final int encodedPtNodeCount = (countSize == 2) ? 248 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; 249 BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize); 250 return countSize; 251 } 252 253 /** 254 * Helper method to hide the actual value of the no children address. 255 */ hasChildrenAddress(final int address)256 public static boolean hasChildrenAddress(final int address) { 257 return FormatSpec.NO_CHILDREN_ADDRESS != address; 258 } 259 260 /** 261 * Compute the binary size of the node count 262 * @param count the node count 263 * @return the size of the node count, either 1 or 2 bytes. 264 */ getPtNodeCountSize(final int count)265 public static int getPtNodeCountSize(final int count) { 266 if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) { 267 return 1; 268 } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) { 269 return 2; 270 } else { 271 throw new RuntimeException("Can't have more than " 272 + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found " 273 + count + ")"); 274 } 275 } 276 getChildrenAddressSize(final int optionFlags)277 static int getChildrenAddressSize(final int optionFlags) { 278 switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { 279 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: 280 return 1; 281 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: 282 return 2; 283 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: 284 return 3; 285 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: 286 default: 287 return 0; 288 } 289 } 290 291 /** 292 * Calculate bigram frequency from compressed value 293 * 294 * @param unigramFrequency 295 * @param bigramFrequency compressed frequency 296 * @return approximate bigram frequency 297 */ 298 @UsedForTesting reconstructBigramFrequency(final int unigramFrequency, final int bigramFrequency)299 public static int reconstructBigramFrequency(final int unigramFrequency, 300 final int bigramFrequency) { 301 final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) 302 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); 303 final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f); 304 return (int)resultFreqFloat; 305 } 306 } 307