1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.Constants;
21 import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
22 
23 import java.io.File;
24 import java.io.IOException;
25 import java.io.OutputStream;
26 import java.util.ArrayList;
27 import java.util.Map;
28 import java.util.Stack;
29 
30 public final class BinaryDictIOUtils {
31     private static final boolean DBG = false;
32 
BinaryDictIOUtils()33     private BinaryDictIOUtils() {
34         // This utility class is not publicly instantiable.
35     }
36 
37     /**
38      * Returns new dictionary decoder.
39      *
40      * @param dictFile the dictionary file.
41      * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
42      * @return new dictionary decoder if the dictionary file exists, otherwise null.
43      */
getDictDecoder(final File dictFile, final long offset, final long length, final int bufferType)44     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
45             final long length, final int bufferType) {
46         if (dictFile.isDirectory()) {
47             return new Ver4DictDecoder(dictFile, bufferType);
48         } else if (dictFile.isFile()) {
49             return new Ver2DictDecoder(dictFile, offset, length, bufferType);
50         }
51         return null;
52     }
53 
getDictDecoder(final File dictFile, final long offset, final long length, final DictionaryBufferFactory factory)54     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
55             final long length, final DictionaryBufferFactory factory) {
56         if (dictFile.isDirectory()) {
57             return new Ver4DictDecoder(dictFile, factory);
58         } else if (dictFile.isFile()) {
59             return new Ver2DictDecoder(dictFile, offset, length, factory);
60         }
61         return null;
62     }
63 
getDictDecoder(final File dictFile, final long offset, final long length)64     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
65             final long length) {
66         return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
67     }
68 
69     private static final class Position {
70         public static final int NOT_READ_PTNODE_COUNT = -1;
71 
72         public int mAddress;
73         public int mNumOfPtNode;
74         public int mPosition;
75         public int mLength;
76 
Position(int address, int length)77         public Position(int address, int length) {
78             mAddress = address;
79             mLength = length;
80             mNumOfPtNode = NOT_READ_PTNODE_COUNT;
81         }
82     }
83 
84     /**
85      * Retrieves all node arrays without recursive call.
86      */
readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, final int bodyOffset, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams)87     private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
88             final int bodyOffset, final Map<Integer, String> words,
89             final Map<Integer, Integer> frequencies,
90             final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
91         int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
92 
93         Stack<Position> stack = new Stack<>();
94         int index = 0;
95 
96         Position initPos = new Position(bodyOffset, 0);
97         stack.push(initPos);
98 
99         while (!stack.empty()) {
100             Position p = stack.peek();
101 
102             if (DBG) {
103                 MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
104                         p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
105             }
106 
107             if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
108             if (index != p.mLength) index = p.mLength;
109 
110             if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
111                 p.mNumOfPtNode = dictDecoder.readPtNodeCount();
112                 p.mAddress = dictDecoder.getPosition();
113                 p.mPosition = 0;
114             }
115             if (p.mNumOfPtNode == 0) {
116                 stack.pop();
117                 continue;
118             }
119             final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
120             for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
121                 pushedChars[index++] = ptNodeInfo.mCharacters[i];
122             }
123             p.mPosition++;
124             if (ptNodeInfo.isTerminal()) {// found word
125                 words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
126                 frequencies.put(
127                         ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
128                 if (ptNodeInfo.mBigrams != null) {
129                     bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
130                 }
131             }
132 
133             if (p.mPosition == p.mNumOfPtNode) {
134                 stack.pop();
135             } else {
136                 // The PtNode array has more PtNodes.
137                 p.mAddress = dictDecoder.getPosition();
138             }
139 
140             if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
141                 final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
142                 stack.push(childrenPos);
143             }
144         }
145     }
146 
147     /**
148      * Reads unigrams and bigrams from the binary file.
149      * Doesn't store a full memory representation of the dictionary.
150      *
151      * @param dictDecoder the dict decoder.
152      * @param words the map to store the address as a key and the word as a value.
153      * @param frequencies the map to store the address as a key and the frequency as a value.
154      * @param bigrams the map to store the address as a key and the list of address as a value.
155      * @throws IOException if the file can't be read.
156      * @throws UnsupportedFormatException if the format of the file is not recognized.
157      */
readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams)158     /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
159             final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
160             final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
161             UnsupportedFormatException {
162         // Read header
163         final DictionaryHeader header = dictDecoder.readHeader();
164         readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
165             frequencies, bigrams);
166     }
167 
168     /**
169      * Gets the address of the last PtNode of the exact matching word in the dictionary.
170      * If no match is found, returns NOT_VALID_WORD.
171      *
172      * @param dictDecoder the dict decoder.
173      * @param word the word we search for.
174      * @return the address of the terminal node.
175      * @throws IOException if the file can't be read.
176      * @throws UnsupportedFormatException if the format of the file is not recognized.
177      */
178     @UsedForTesting
getTerminalPosition(final DictDecoder dictDecoder, final String word)179     /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
180             final String word) throws IOException, UnsupportedFormatException {
181         if (word == null) return FormatSpec.NOT_VALID_WORD;
182         dictDecoder.setPosition(0);
183         dictDecoder.readHeader();
184         int wordPos = 0;
185         final int wordLen = word.codePointCount(0, word.length());
186         for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
187             if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
188 
189             do {
190                 final int ptNodeCount = dictDecoder.readPtNodeCount();
191                 boolean foundNextPtNode = false;
192                 for (int i = 0; i < ptNodeCount; ++i) {
193                     final int ptNodePos = dictDecoder.getPosition();
194                     final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
195                     boolean same = true;
196                     for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
197                             p < currentInfo.mCharacters.length;
198                             ++p, j = word.offsetByCodePoints(j, 1)) {
199                         if (wordPos + p >= wordLen
200                                 || word.codePointAt(j) != currentInfo.mCharacters[p]) {
201                             same = false;
202                             break;
203                         }
204                     }
205 
206                     if (same) {
207                         // found the PtNode matches the word.
208                         if (wordPos + currentInfo.mCharacters.length == wordLen) {
209                             if (!currentInfo.isTerminal()) {
210                                 return FormatSpec.NOT_VALID_WORD;
211                             } else {
212                                 return ptNodePos;
213                             }
214                         }
215                         wordPos += currentInfo.mCharacters.length;
216                         if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
217                             return FormatSpec.NOT_VALID_WORD;
218                         }
219                         foundNextPtNode = true;
220                         dictDecoder.setPosition(currentInfo.mChildrenAddress);
221                         break;
222                     }
223                 }
224                 if (foundNextPtNode) break;
225                 return FormatSpec.NOT_VALID_WORD;
226             } while(true);
227         }
228         return FormatSpec.NOT_VALID_WORD;
229     }
230 
231     /**
232      * Writes a PtNodeCount to the stream.
233      *
234      * @param destination the stream to write.
235      * @param ptNodeCount the count.
236      * @return the size written in bytes.
237      */
238     @UsedForTesting
writePtNodeCount(final OutputStream destination, final int ptNodeCount)239     static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
240             throws IOException {
241         final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
242         // the count must fit on one byte or two bytes.
243         // Please see comments in FormatSpec.
244         if (countSize != 1 && countSize != 2) {
245             throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
246         }
247         final int encodedPtNodeCount = (countSize == 2) ?
248                 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
249         BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
250         return countSize;
251     }
252 
253     /**
254      * Helper method to hide the actual value of the no children address.
255      */
hasChildrenAddress(final int address)256     public static boolean hasChildrenAddress(final int address) {
257         return FormatSpec.NO_CHILDREN_ADDRESS != address;
258     }
259 
260     /**
261      * Compute the binary size of the node count
262      * @param count the node count
263      * @return the size of the node count, either 1 or 2 bytes.
264      */
getPtNodeCountSize(final int count)265     public static int getPtNodeCountSize(final int count) {
266         if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
267             return 1;
268         } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
269             return 2;
270         } else {
271             throw new RuntimeException("Can't have more than "
272                     + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
273                     + count + ")");
274         }
275     }
276 
getChildrenAddressSize(final int optionFlags)277     static int getChildrenAddressSize(final int optionFlags) {
278         switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
279             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
280                 return 1;
281             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
282                 return 2;
283             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
284                 return 3;
285             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
286             default:
287                 return 0;
288         }
289     }
290 
291     /**
292      * Calculate bigram frequency from compressed value
293      *
294      * @param unigramFrequency
295      * @param bigramFrequency compressed frequency
296      * @return approximate bigram frequency
297      */
298     @UsedForTesting
reconstructBigramFrequency(final int unigramFrequency, final int bigramFrequency)299     public static int reconstructBigramFrequency(final int unigramFrequency,
300             final int bigramFrequency) {
301         final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
302                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
303         final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
304         return (int)resultFreqFloat;
305     }
306 }
307