1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; 21 import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; 22 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 23 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; 24 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; 25 26 import java.io.File; 27 import java.io.FileNotFoundException; 28 import java.io.FileOutputStream; 29 import java.io.IOException; 30 import java.io.OutputStream; 31 import java.util.ArrayList; 32 import java.util.Collections; 33 import java.util.Comparator; 34 import java.util.HashMap; 35 import java.util.Iterator; 36 import java.util.Map.Entry; 37 import java.util.Objects; 38 39 /** 40 * An implementation of DictEncoder for version 2 binary dictionary. 41 */ 42 @UsedForTesting 43 public class Ver2DictEncoder implements DictEncoder { 44 45 private final File mDictFile; 46 private OutputStream mOutStream; 47 private byte[] mBuffer; 48 private int mPosition; 49 private final int mCodePointTableMode; 50 public static final int CODE_POINT_TABLE_OFF = 0; 51 public static final int CODE_POINT_TABLE_ON = 1; 52 53 @UsedForTesting Ver2DictEncoder(final File dictFile, final int codePointTableMode)54 public Ver2DictEncoder(final File dictFile, final int codePointTableMode) { 55 mDictFile = dictFile; 56 mOutStream = null; 57 mBuffer = null; 58 mCodePointTableMode = codePointTableMode; 59 } 60 61 // This constructor is used only by BinaryDictOffdeviceUtilsTests. 62 // If you want to use this in the production code, you should consider keeping consistency of 63 // the interface of Ver3DictDecoder by using factory. 64 @UsedForTesting Ver2DictEncoder(final OutputStream outStream)65 public Ver2DictEncoder(final OutputStream outStream) { 66 mDictFile = null; 67 mOutStream = outStream; 68 mCodePointTableMode = CODE_POINT_TABLE_OFF; 69 } 70 openStream()71 private void openStream() throws FileNotFoundException { 72 mOutStream = new FileOutputStream(mDictFile); 73 } 74 close()75 private void close() throws IOException { 76 if (mOutStream != null) { 77 mOutStream.close(); 78 mOutStream = null; 79 } 80 } 81 82 // Package for testing makeCodePointTable(final FusionDictionary dict)83 static CodePointTable makeCodePointTable(final FusionDictionary dict) { 84 final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>(); 85 for (final WordProperty word : dict) { 86 // Store per code point occurrence 87 final String wordString = word.mWord; 88 for (int i = 0; i < wordString.length(); ++i) { 89 final int codePoint = Character.codePointAt(wordString, i); 90 if (codePointOccurrenceCounts.containsKey(codePoint)) { 91 codePointOccurrenceCounts.put(codePoint, 92 codePointOccurrenceCounts.get(codePoint) + 1); 93 } else { 94 codePointOccurrenceCounts.put(codePoint, 1); 95 } 96 } 97 } 98 final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray = 99 new ArrayList<>(codePointOccurrenceCounts.entrySet()); 100 // Descending order sort by occurrence (value side) 101 Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() { 102 @Override 103 public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) { 104 if (!Objects.equals(a.getValue(), b.getValue())) { 105 return b.getValue().compareTo(a.getValue()); 106 } 107 return b.getKey().compareTo(a.getKey()); 108 } 109 }); 110 int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; 111 // Temporary map for writing of nodes 112 final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>(); 113 for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { 114 // Put a relation from the original code point to the one byte code. 115 codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); 116 if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { 117 break; 118 } 119 } 120 // codePointToOneByteCodeMap for writing the trie 121 // codePointOccurrenceArray for writing the header 122 return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); 123 } 124 125 @Override writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)126 public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) 127 throws IOException, UnsupportedFormatException { 128 // We no longer support anything but the latest version of v2. 129 if (formatOptions.mVersion != FormatSpec.VERSION202) { 130 throw new UnsupportedFormatException( 131 "The given format options has wrong version number : " 132 + formatOptions.mVersion); 133 } 134 135 if (mOutStream == null) { 136 openStream(); 137 } 138 139 // Make code point conversion table ordered by occurrence of code points 140 // Version 201 or later have codePointTable 141 final CodePointTable codePointTable; 142 if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion 143 < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) { 144 codePointTable = new CodePointTable(); 145 } else { 146 codePointTable = makeCodePointTable(dict); 147 } 148 149 BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, 150 codePointTable.mCodePointOccurrenceArray); 151 152 // Addresses are limited to 3 bytes, but since addresses can be relative to each node 153 // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding 154 // the order of the PtNode arrays becomes a quite complicated problem, because though the 155 // dictionary itself does not have a size limit, each node array must still be within 16MB 156 // of all its children and parents. As long as this is ensured, the dictionary file may 157 // grow to any size. 158 159 // Leave the choice of the optimal node order to the flattenTree function. 160 MakedictLog.i("Flattening the tree..."); 161 ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); 162 163 MakedictLog.i("Computing addresses..."); 164 BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, 165 codePointTable.mCodePointToOneByteCodeMap); 166 MakedictLog.i("Checking PtNode array..."); 167 if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); 168 169 // Create a buffer that matches the final dictionary size. 170 final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); 171 final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; 172 mBuffer = new byte[bufferSize]; 173 174 MakedictLog.i("Writing file..."); 175 176 for (PtNodeArray nodeArray : flatNodes) { 177 BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, 178 codePointTable.mCodePointToOneByteCodeMap); 179 } 180 if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); 181 mOutStream.write(mBuffer, 0, mPosition); 182 183 MakedictLog.i("Done"); 184 close(); 185 } 186 187 @Override setPosition(final int position)188 public void setPosition(final int position) { 189 if (mBuffer == null || position < 0 || position >= mBuffer.length) return; 190 mPosition = position; 191 } 192 193 @Override getPosition()194 public int getPosition() { 195 return mPosition; 196 } 197 198 @Override writePtNodeCount(final int ptNodeCount)199 public void writePtNodeCount(final int ptNodeCount) { 200 final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); 201 if (countSize != 1 && countSize != 2) { 202 throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); 203 } 204 final int encodedPtNodeCount = (countSize == 2) ? 205 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; 206 mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, 207 countSize); 208 } 209 writePtNodeFlags(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap)210 private void writePtNodeFlags(final PtNode ptNode, 211 final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 212 final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, 213 codePointToOneByteCodeMap); 214 mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, 215 BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos), 216 FormatSpec.PTNODE_FLAGS_SIZE); 217 } 218 writeCharacters(final int[] codePoints, final boolean hasSeveralChars, final HashMap<Integer, Integer> codePointToOneByteCodeMap)219 private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars, 220 final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 221 mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition, 222 codePointToOneByteCodeMap); 223 if (hasSeveralChars) { 224 mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; 225 } 226 } 227 writeFrequency(final int frequency)228 private void writeFrequency(final int frequency) { 229 if (frequency >= 0) { 230 mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, 231 FormatSpec.PTNODE_FREQUENCY_SIZE); 232 } 233 } 234 writeChildrenPosition(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap)235 private void writeChildrenPosition(final PtNode ptNode, 236 final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 237 final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, 238 codePointToOneByteCodeMap); 239 mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, 240 childrenPos); 241 } 242 243 /** 244 * Write a bigram attributes list to mBuffer. 245 * 246 * @param bigrams the bigram attributes list. 247 * @param dict the dictionary the node array is a part of (for relative offsets). 248 */ writeBigrams(final ArrayList<WeightedString> bigrams, final FusionDictionary dict)249 private void writeBigrams(final ArrayList<WeightedString> bigrams, 250 final FusionDictionary dict) { 251 if (bigrams == null) return; 252 253 final Iterator<WeightedString> bigramIterator = bigrams.iterator(); 254 while (bigramIterator.hasNext()) { 255 final WeightedString bigram = bigramIterator.next(); 256 final PtNode target = 257 FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); 258 final int addressOfBigram = target.mCachedAddressAfterUpdate; 259 final int unigramFrequencyForThisWord = target.getProbability(); 260 final int offset = addressOfBigram 261 - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); 262 final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), 263 offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord); 264 mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, 265 FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); 266 mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, 267 Math.abs(offset)); 268 } 269 } 270 271 @Override writePtNode(final PtNode ptNode, final FusionDictionary dict, final HashMap<Integer, Integer> codePointToOneByteCodeMap)272 public void writePtNode(final PtNode ptNode, final FusionDictionary dict, 273 final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 274 writePtNodeFlags(ptNode, codePointToOneByteCodeMap); 275 writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap); 276 writeFrequency(ptNode.getProbability()); 277 writeChildrenPosition(ptNode, codePointToOneByteCodeMap); 278 writeBigrams(ptNode.mBigrams, dict); 279 } 280 } 281