1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.BinaryDictionary;
21 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
22 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
23 
24 import java.io.File;
25 import java.io.FileNotFoundException;
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.Arrays;
29 
30 /**
31  * An implementation of DictDecoder for version 2 binary dictionary.
32  */
33 // TODO: Separate logics that are used only for testing.
34 @UsedForTesting
35 public class Ver2DictDecoder extends AbstractDictDecoder {
36     /**
37      * A utility class for reading a PtNode.
38      */
39     protected static class PtNodeReader {
readProbabilityInfo(final DictBuffer dictBuffer)40         private static ProbabilityInfo readProbabilityInfo(final DictBuffer dictBuffer) {
41             // Ver2 dicts don't contain historical information.
42             return new ProbabilityInfo(dictBuffer.readUnsignedByte());
43         }
44 
readPtNodeOptionFlags(final DictBuffer dictBuffer)45         protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) {
46             return dictBuffer.readUnsignedByte();
47         }
48 
readChildrenAddress(final DictBuffer dictBuffer, final int ptNodeFlags)49         protected static int readChildrenAddress(final DictBuffer dictBuffer,
50                 final int ptNodeFlags) {
51             switch (ptNodeFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
52                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
53                     return dictBuffer.readUnsignedByte();
54                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
55                     return dictBuffer.readUnsignedShort();
56                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
57                     return dictBuffer.readUnsignedInt24();
58                 case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
59                 default:
60                     return FormatSpec.NO_CHILDREN_ADDRESS;
61             }
62         }
63 
64         // Reads shortcuts and returns the read length.
readShortcut(final DictBuffer dictBuffer, final ArrayList<WeightedString> shortcutTargets)65         protected static int readShortcut(final DictBuffer dictBuffer,
66                 final ArrayList<WeightedString> shortcutTargets) {
67             final int pointerBefore = dictBuffer.position();
68             dictBuffer.readUnsignedShort(); // skip the size
69             while (true) {
70                 final int targetFlags = dictBuffer.readUnsignedByte();
71                 final String word = CharEncoding.readString(dictBuffer);
72                 shortcutTargets.add(new WeightedString(word,
73                         targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
74                 if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
75             }
76             return dictBuffer.position() - pointerBefore;
77         }
78 
readBigramAddresses(final DictBuffer dictBuffer, final ArrayList<PendingAttribute> bigrams, final int baseAddress)79         protected static int readBigramAddresses(final DictBuffer dictBuffer,
80                 final ArrayList<PendingAttribute> bigrams, final int baseAddress) {
81             int readLength = 0;
82             int bigramCount = 0;
83             while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
84                 final int bigramFlags = dictBuffer.readUnsignedByte();
85                 ++readLength;
86                 final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE)
87                         ? 1 : -1;
88                 int bigramAddress = baseAddress + readLength;
89                 switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) {
90                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE:
91                         bigramAddress += sign * dictBuffer.readUnsignedByte();
92                         readLength += 1;
93                         break;
94                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES:
95                         bigramAddress += sign * dictBuffer.readUnsignedShort();
96                         readLength += 2;
97                         break;
98                     case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES:
99                         bigramAddress += sign * dictBuffer.readUnsignedInt24();
100                         readLength += 3;
101                         break;
102                     default:
103                         throw new RuntimeException("Has bigrams with no address");
104                 }
105                 bigrams.add(new PendingAttribute(
106                         bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
107                         bigramAddress));
108                 if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
109             }
110             return readLength;
111         }
112     }
113 
114     protected final File mDictionaryBinaryFile;
115     protected final long mOffset;
116     protected final long mLength;
117     // TODO: Remove mBufferFactory and mDictBuffer from this class members because they are now
118     // used only for testing.
119     private final DictionaryBufferFactory mBufferFactory;
120     protected DictBuffer mDictBuffer;
121 
122     @UsedForTesting
Ver2DictDecoder(final File file, final long offset, final long length, final int factoryFlag)123     /* package */ Ver2DictDecoder(final File file, final long offset, final long length,
124             final int factoryFlag) {
125         mDictionaryBinaryFile = file;
126         mOffset = offset;
127         mLength = length;
128         mDictBuffer = null;
129         if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
130             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
131         } else if ((factoryFlag  & MASK_DICTBUFFER) == USE_BYTEARRAY) {
132             mBufferFactory = new DictionaryBufferFromByteArrayFactory();
133         } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) {
134             mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory();
135         } else {
136             mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
137         }
138     }
139 
Ver2DictDecoder(final File file, final long offset, final long length, final DictionaryBufferFactory factory)140     /* package */ Ver2DictDecoder(final File file, final long offset, final long length,
141             final DictionaryBufferFactory factory) {
142         mDictionaryBinaryFile = file;
143         mOffset = offset;
144         mLength = length;
145         mBufferFactory = factory;
146     }
147 
148     @Override
openDictBuffer()149     public void openDictBuffer() throws FileNotFoundException, IOException {
150         mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile);
151     }
152 
153     @Override
isDictBufferOpen()154     public boolean isDictBufferOpen() {
155         return mDictBuffer != null;
156     }
157 
getDictBuffer()158     /* package */ DictBuffer getDictBuffer() {
159         return mDictBuffer;
160     }
161 
162     @UsedForTesting
openAndGetDictBuffer()163     /* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException {
164         openDictBuffer();
165         return getDictBuffer();
166     }
167 
168     @Override
readHeader()169     public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException {
170         // dictType is not being used in dicttool. Passing an empty string.
171         final BinaryDictionary binaryDictionary = new BinaryDictionary(
172                 mDictionaryBinaryFile.getAbsolutePath(), mOffset, mLength,
173                 true /* useFullEditDistance */, null /* locale */, "" /* dictType */,
174                 false /* isUpdatable */);
175         final DictionaryHeader header = binaryDictionary.getHeader();
176         binaryDictionary.close();
177         if (header == null) {
178             throw new IOException("Cannot read the dictionary header.");
179         }
180         if (header.mFormatOptions.mVersion != FormatSpec.VERSION2) {
181             throw new UnsupportedFormatException("File header has a wrong version : "
182                     + header.mFormatOptions.mVersion);
183         }
184         if (!isDictBufferOpen()) {
185             openDictBuffer();
186         }
187         // Advance buffer reading position to the head of dictionary body.
188         setPosition(header.mBodyOffset);
189         return header;
190     }
191 
192     // TODO: Make this buffer multi thread safe.
193     private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
194     @Override
readPtNode(final int ptNodePos)195     public PtNodeInfo readPtNode(final int ptNodePos) {
196         int addressPointer = ptNodePos;
197         final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
198         addressPointer += FormatSpec.PTNODE_FLAGS_SIZE;
199         final int characters[];
200         if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
201             int index = 0;
202             int character = CharEncoding.readChar(mDictBuffer);
203             addressPointer += CharEncoding.getCharSize(character);
204             while (FormatSpec.INVALID_CHARACTER != character) {
205                 // FusionDictionary is making sure that the length of the word is smaller than
206                 // MAX_WORD_LENGTH.
207                 // So we'll never write past the end of mCharacterBuffer.
208                 mCharacterBuffer[index++] = character;
209                 character = CharEncoding.readChar(mDictBuffer);
210                 addressPointer += CharEncoding.getCharSize(character);
211             }
212             characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
213         } else {
214             final int character = CharEncoding.readChar(mDictBuffer);
215             addressPointer += CharEncoding.getCharSize(character);
216             characters = new int[] { character };
217         }
218         final ProbabilityInfo probabilityInfo;
219         if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
220             probabilityInfo = PtNodeReader.readProbabilityInfo(mDictBuffer);
221             addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE;
222         } else {
223             probabilityInfo = null;
224         }
225         int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags);
226         if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
227             childrenAddress += addressPointer;
228         }
229         addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags);
230         final ArrayList<WeightedString> shortcutTargets;
231         if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
232             // readShortcut will add shortcuts to shortcutTargets.
233             shortcutTargets = new ArrayList<>();
234             addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets);
235         } else {
236             shortcutTargets = null;
237         }
238 
239         final ArrayList<PendingAttribute> bigrams;
240         if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
241             bigrams = new ArrayList<>();
242             addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams,
243                     addressPointer);
244             if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
245                 throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
246                         + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
247             }
248         } else {
249             bigrams = null;
250         }
251         return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, probabilityInfo,
252                 childrenAddress, shortcutTargets, bigrams);
253     }
254 
255     @Override
readDictionaryBinary(final boolean deleteDictIfBroken)256     public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken)
257             throws FileNotFoundException, IOException, UnsupportedFormatException {
258         // dictType is not being used in dicttool. Passing an empty string.
259         final BinaryDictionary binaryDictionary = new BinaryDictionary(
260                 mDictionaryBinaryFile.getAbsolutePath(), 0 /* offset */,
261                 mDictionaryBinaryFile.length() /* length */, true /* useFullEditDistance */,
262                 null /* locale */, "" /* dictType */, false /* isUpdatable */);
263         final DictionaryHeader header = readHeader();
264         final FusionDictionary fusionDict =
265                 new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions);
266         int token = 0;
267         final ArrayList<WordProperty> wordProperties = new ArrayList<>();
268         do {
269             final BinaryDictionary.GetNextWordPropertyResult result =
270                     binaryDictionary.getNextWordProperty(token);
271             final WordProperty wordProperty = result.mWordProperty;
272             if (wordProperty == null) {
273                 binaryDictionary.close();
274                 if (deleteDictIfBroken) {
275                     mDictionaryBinaryFile.delete();
276                 }
277                 return null;
278             }
279             wordProperties.add(wordProperty);
280             token = result.mNextToken;
281         } while (token != 0);
282 
283         // Insert unigrams into the fusion dictionary.
284         for (final WordProperty wordProperty : wordProperties) {
285             if (wordProperty.mIsBlacklistEntry) {
286                 fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
287                         wordProperty.mIsNotAWord);
288             } else {
289                 fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
290                         wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
291             }
292         }
293         // Insert bigrams into the fusion dictionary.
294         for (final WordProperty wordProperty : wordProperties) {
295             if (wordProperty.mBigrams == null) {
296                 continue;
297             }
298             final String word0 = wordProperty.mWord;
299             for (final WeightedString bigram : wordProperty.mBigrams) {
300                 fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo);
301             }
302         }
303         binaryDictionary.close();
304         return fusionDict;
305     }
306 
307     @Override
setPosition(int newPos)308     public void setPosition(int newPos) {
309         mDictBuffer.position(newPos);
310     }
311 
312     @Override
getPosition()313     public int getPosition() {
314         return mDictBuffer.position();
315     }
316 
317     @Override
readPtNodeCount()318     public int readPtNodeCount() {
319         return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer);
320     }
321 }
322