1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.define.DecoderSpecificConstants;
21 import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
22 
23 import java.io.File;
24 import java.io.IOException;
25 import java.io.OutputStream;
26 import java.util.ArrayList;
27 import java.util.Map;
28 import java.util.Stack;
29 
30 public final class BinaryDictIOUtils {
31     private static final boolean DBG = false;
32 
BinaryDictIOUtils()33     private BinaryDictIOUtils() {
34         // This utility class is not publicly instantiable.
35     }
36 
37     /**
38      * Returns new dictionary decoder.
39      *
40      * @param dictFile the dictionary file.
41      * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
42      * @return new dictionary decoder if the dictionary file exists, otherwise null.
43      */
getDictDecoder(final File dictFile, final long offset, final long length, final int bufferType)44     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
45             final long length, final int bufferType) {
46         return new Ver4DictDecoder(dictFile);
47     }
48 
getDictDecoder(final File dictFile, final long offset, final long length, final DictionaryBufferFactory factory)49     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
50             final long length, final DictionaryBufferFactory factory) {
51         return new Ver4DictDecoder(dictFile);
52     }
53 
getDictDecoder(final File dictFile, final long offset, final long length)54     public static DictDecoder getDictDecoder(final File dictFile, final long offset,
55             final long length) {
56         return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
57     }
58 
59     private static final class Position {
60         public static final int NOT_READ_PTNODE_COUNT = -1;
61 
62         public int mAddress;
63         public int mNumOfPtNode;
64         public int mPosition;
65         public int mLength;
66 
Position(int address, int length)67         public Position(int address, int length) {
68             mAddress = address;
69             mLength = length;
70             mNumOfPtNode = NOT_READ_PTNODE_COUNT;
71         }
72     }
73 
74     /**
75      * Retrieves all node arrays without recursive call.
76      */
readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, final int bodyOffset, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams)77     private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
78             final int bodyOffset, final Map<Integer, String> words,
79             final Map<Integer, Integer> frequencies,
80             final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
81         int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
82 
83         Stack<Position> stack = new Stack<>();
84         int index = 0;
85 
86         Position initPos = new Position(bodyOffset, 0);
87         stack.push(initPos);
88 
89         while (!stack.empty()) {
90             Position p = stack.peek();
91 
92             if (DBG) {
93                 MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
94                         p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
95             }
96 
97             if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
98             if (index != p.mLength) index = p.mLength;
99 
100             if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
101                 p.mNumOfPtNode = dictDecoder.readPtNodeCount();
102                 p.mAddress = dictDecoder.getPosition();
103                 p.mPosition = 0;
104             }
105             if (p.mNumOfPtNode == 0) {
106                 stack.pop();
107                 continue;
108             }
109             final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
110             for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
111                 pushedChars[index++] = ptNodeInfo.mCharacters[i];
112             }
113             p.mPosition++;
114             if (ptNodeInfo.isTerminal()) {// found word
115                 words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
116                 frequencies.put(
117                         ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
118                 if (ptNodeInfo.mBigrams != null) {
119                     bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
120                 }
121             }
122 
123             if (p.mPosition == p.mNumOfPtNode) {
124                 stack.pop();
125             } else {
126                 // The PtNode array has more PtNodes.
127                 p.mAddress = dictDecoder.getPosition();
128             }
129 
130             if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
131                 final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
132                 stack.push(childrenPos);
133             }
134         }
135     }
136 
137     /**
138      * Reads unigrams and bigrams from the binary file.
139      * Doesn't store a full memory representation of the dictionary.
140      *
141      * @param dictDecoder the dict decoder.
142      * @param words the map to store the address as a key and the word as a value.
143      * @param frequencies the map to store the address as a key and the frequency as a value.
144      * @param bigrams the map to store the address as a key and the list of address as a value.
145      * @throws IOException if the file can't be read.
146      * @throws UnsupportedFormatException if the format of the file is not recognized.
147      */
readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams)148     /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
149             final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
150             final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
151             UnsupportedFormatException {
152         // Read header
153         final DictionaryHeader header = dictDecoder.readHeader();
154         readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
155             frequencies, bigrams);
156     }
157 
158     /**
159      * Gets the address of the last PtNode of the exact matching word in the dictionary.
160      * If no match is found, returns NOT_VALID_WORD.
161      *
162      * @param dictDecoder the dict decoder.
163      * @param word the word we search for.
164      * @return the address of the terminal node.
165      * @throws IOException if the file can't be read.
166      * @throws UnsupportedFormatException if the format of the file is not recognized.
167      */
168     @UsedForTesting
getTerminalPosition(final DictDecoder dictDecoder, final String word)169     /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
170             final String word) throws IOException, UnsupportedFormatException {
171         if (word == null) return FormatSpec.NOT_VALID_WORD;
172         dictDecoder.setPosition(0);
173         dictDecoder.readHeader();
174         int wordPos = 0;
175         final int wordLen = word.codePointCount(0, word.length());
176         for (int depth = 0; depth < DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
177             if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
178 
179             do {
180                 final int ptNodeCount = dictDecoder.readPtNodeCount();
181                 boolean foundNextPtNode = false;
182                 for (int i = 0; i < ptNodeCount; ++i) {
183                     final int ptNodePos = dictDecoder.getPosition();
184                     final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
185                     boolean same = true;
186                     for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
187                             p < currentInfo.mCharacters.length;
188                             ++p, j = word.offsetByCodePoints(j, 1)) {
189                         if (wordPos + p >= wordLen
190                                 || word.codePointAt(j) != currentInfo.mCharacters[p]) {
191                             same = false;
192                             break;
193                         }
194                     }
195 
196                     if (same) {
197                         // found the PtNode matches the word.
198                         if (wordPos + currentInfo.mCharacters.length == wordLen) {
199                             return currentInfo.isTerminal() ? ptNodePos : FormatSpec.NOT_VALID_WORD;
200                         }
201                         wordPos += currentInfo.mCharacters.length;
202                         if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
203                             return FormatSpec.NOT_VALID_WORD;
204                         }
205                         foundNextPtNode = true;
206                         dictDecoder.setPosition(currentInfo.mChildrenAddress);
207                         break;
208                     }
209                 }
210                 if (foundNextPtNode) break;
211                 return FormatSpec.NOT_VALID_WORD;
212             } while(true);
213         }
214         return FormatSpec.NOT_VALID_WORD;
215     }
216 
217     /**
218      * Writes a PtNodeCount to the stream.
219      *
220      * @param destination the stream to write.
221      * @param ptNodeCount the count.
222      * @return the size written in bytes.
223      */
224     @UsedForTesting
writePtNodeCount(final OutputStream destination, final int ptNodeCount)225     static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
226             throws IOException {
227         final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
228         // the count must fit on one byte or two bytes.
229         // Please see comments in FormatSpec.
230         if (countSize != 1 && countSize != 2) {
231             throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
232         }
233         final int encodedPtNodeCount = (countSize == 2) ?
234                 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
235         BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
236         return countSize;
237     }
238 
239     /**
240      * Helper method to hide the actual value of the no children address.
241      */
hasChildrenAddress(final int address)242     public static boolean hasChildrenAddress(final int address) {
243         return FormatSpec.NO_CHILDREN_ADDRESS != address;
244     }
245 
246     /**
247      * Compute the binary size of the node count
248      * @param count the node count
249      * @return the size of the node count, either 1 or 2 bytes.
250      */
getPtNodeCountSize(final int count)251     public static int getPtNodeCountSize(final int count) {
252         if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
253             return 1;
254         } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
255             return 2;
256         } else {
257             throw new RuntimeException("Can't have more than "
258                     + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
259                     + count + ")");
260         }
261     }
262 
getChildrenAddressSize(final int optionFlags)263     static int getChildrenAddressSize(final int optionFlags) {
264         switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
265             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
266                 return 1;
267             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
268                 return 2;
269             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
270                 return 3;
271             case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
272             default:
273                 return 0;
274         }
275     }
276 
277     /**
278      * Calculate bigram frequency from compressed value
279      *
280      * @param unigramFrequency
281      * @param bigramFrequency compressed frequency
282      * @return approximate bigram frequency
283      */
284     @UsedForTesting
reconstructBigramFrequency(final int unigramFrequency, final int bigramFrequency)285     public static int reconstructBigramFrequency(final int unigramFrequency,
286             final int bigramFrequency) {
287         final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
288                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
289         final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
290         return (int)resultFreqFloat;
291     }
292 }
293