1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
21 
22 import java.io.File;
23 import java.io.IOException;
24 import java.io.OutputStream;
25 import java.nio.ByteBuffer;
26 import java.util.HashMap;
27 import java.util.LinkedList;
28 
29 import javax.annotation.Nonnull;
30 
31 /**
32  * Decodes binary files for a FusionDictionary.
33  *
34  * All the methods in this class are static.
35  *
36  * TODO: Move this file to makedict/internal.
37  * TODO: Rename this class to DictDecoderUtils.
38  */
39 public final class BinaryDictDecoderUtils {
    /**
     * Private constructor: this class only groups static helpers and nested types, so it is
     * never meant to be instantiated.
     */
    private BinaryDictDecoderUtils() {
        // This utility class is not publicly instantiable.
    }
43 
44     @UsedForTesting
45     public interface DictBuffer {
readUnsignedByte()46         public int readUnsignedByte();
readUnsignedShort()47         public int readUnsignedShort();
readUnsignedInt24()48         public int readUnsignedInt24();
readInt()49         public int readInt();
position()50         public int position();
position(int newPosition)51         public void position(int newPosition);
52         @UsedForTesting
put(final byte b)53         public void put(final byte b);
limit()54         public int limit();
55         @UsedForTesting
capacity()56         public int capacity();
57     }
58 
59     public static final class ByteBufferDictBuffer implements DictBuffer {
60         private ByteBuffer mBuffer;
61 
ByteBufferDictBuffer(final ByteBuffer buffer)62         public ByteBufferDictBuffer(final ByteBuffer buffer) {
63             mBuffer = buffer;
64         }
65 
66         @Override
readUnsignedByte()67         public int readUnsignedByte() {
68             return mBuffer.get() & 0xFF;
69         }
70 
71         @Override
readUnsignedShort()72         public int readUnsignedShort() {
73             return mBuffer.getShort() & 0xFFFF;
74         }
75 
76         @Override
readUnsignedInt24()77         public int readUnsignedInt24() {
78             final int retval = readUnsignedByte();
79             return (retval << 16) + readUnsignedShort();
80         }
81 
82         @Override
readInt()83         public int readInt() {
84             return mBuffer.getInt();
85         }
86 
87         @Override
position()88         public int position() {
89             return mBuffer.position();
90         }
91 
92         @Override
position(int newPos)93         public void position(int newPos) {
94             mBuffer.position(newPos);
95         }
96 
97         @Override
put(final byte b)98         public void put(final byte b) {
99             mBuffer.put(b);
100         }
101 
102         @Override
limit()103         public int limit() {
104             return mBuffer.limit();
105         }
106 
107         @Override
capacity()108         public int capacity() {
109             return mBuffer.capacity();
110         }
111     }
112 
    /**
     * A class grouping utility functions for our specific character encoding.
     */
116     static final class CharEncoding {
117 
118         /**
119          * Helper method to find out whether this code fits on one byte
120          */
fitsOnOneByte(final int character, final HashMap<Integer, Integer> codePointToOneByteCodeMap)121         private static boolean fitsOnOneByte(final int character,
122                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
123             int codePoint = character;
124             if (codePointToOneByteCodeMap != null) {
125                 if (codePointToOneByteCodeMap.containsKey(character)) {
126                     codePoint = codePointToOneByteCodeMap.get(character);
127                 }
128             }
129             return codePoint >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
130                     && codePoint <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
131         }
132 
133         /**
134          * Compute the size of a character given its character code.
135          *
136          * Char format is:
137          * 1 byte = bbbbbbbb match
138          * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
139          * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
140          *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
141          *       00011111 would be outside unicode.
142          * else: iso-latin-1 code
143          * This allows for the whole unicode range to be encoded, including chars outside of
144          * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
145          * characters which should never happen anyway (and still work, but take 3 bytes).
146          *
147          * @param character the character code.
148          * @return the size in binary encoded-form, either 1 or 3 bytes.
149          */
getCharSize(final int character, final HashMap<Integer, Integer> codePointToOneByteCodeMap)150         static int getCharSize(final int character,
151                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
152             // See char encoding in FusionDictionary.java
153             if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1;
154             if (FormatSpec.INVALID_CHARACTER == character) return 1;
155             return 3;
156         }
157 
158         /**
159          * Compute the byte size of a character array.
160          */
getCharArraySize(final int[] chars, final HashMap<Integer, Integer> codePointToOneByteCodeMap)161         static int getCharArraySize(final int[] chars,
162                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
163             int size = 0;
164             for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap);
165             return size;
166         }
167 
168         /**
169          * Writes a char array to a byte buffer.
170          *
171          * @param codePoints the code point array to write.
172          * @param buffer the byte buffer to write to.
173          * @param fromIndex the index in buffer to write the character array to.
174          * @param codePointToOneByteCodeMap the map to convert the code point.
175          * @return the index after the last character.
176          */
writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex, final HashMap<Integer, Integer> codePointToOneByteCodeMap)177         static int writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex,
178                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
179             int index = fromIndex;
180             for (int codePoint : codePoints) {
181                 if (codePointToOneByteCodeMap != null) {
182                     if (codePointToOneByteCodeMap.containsKey(codePoint)) {
183                         // Convert code points
184                         codePoint = codePointToOneByteCodeMap.get(codePoint);
185                     }
186                 }
187                 if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
188                     buffer[index++] = (byte)codePoint;
189                 } else {
190                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
191                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
192                     buffer[index++] = (byte)(0xFF & codePoint);
193                 }
194             }
195             return index;
196         }
197 
198         /**
199          * Writes a string with our character format to a byte buffer.
200          *
201          * This will also write the terminator byte.
202          *
203          * @param buffer the byte buffer to write to.
204          * @param origin the offset to write from.
205          * @param word the string to write.
206          * @return the size written, in bytes.
207          */
writeString(final byte[] buffer, final int origin, final String word, final HashMap<Integer, Integer> codePointToOneByteCodeMap)208         static int writeString(final byte[] buffer, final int origin, final String word,
209                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
210             final int length = word.length();
211             int index = origin;
212             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
213                 int codePoint = word.codePointAt(i);
214                 if (codePointToOneByteCodeMap != null) {
215                     if (codePointToOneByteCodeMap.containsKey(codePoint)) {
216                         // Convert code points
217                         codePoint = codePointToOneByteCodeMap.get(codePoint);
218                     }
219                 }
220                 if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
221                     buffer[index++] = (byte)codePoint;
222                 } else {
223                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
224                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
225                     buffer[index++] = (byte)(0xFF & codePoint);
226                 }
227             }
228             buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
229             return index - origin;
230         }
231 
232         /**
233          * Writes a string with our character format to an OutputStream.
234          *
235          * This will also write the terminator byte.
236          *
237          * @param stream the OutputStream to write to.
238          * @param word the string to write.
239          * @return the size written, in bytes.
240          */
writeString(final OutputStream stream, final String word, final HashMap<Integer, Integer> codePointToOneByteCodeMap)241         static int writeString(final OutputStream stream, final String word,
242                 final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException {
243             final int length = word.length();
244             int written = 0;
245             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
246                 final int codePoint = word.codePointAt(i);
247                 final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap);
248                 if (1 == charSize) {
249                     stream.write((byte) codePoint);
250                 } else {
251                     stream.write((byte) (0xFF & (codePoint >> 16)));
252                     stream.write((byte) (0xFF & (codePoint >> 8)));
253                     stream.write((byte) (0xFF & codePoint));
254                 }
255                 written += charSize;
256             }
257             stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
258             written += FormatSpec.PTNODE_TERMINATOR_SIZE;
259             return written;
260         }
261 
262         /**
263          * Reads a string from a DictBuffer. This is the converse of the above method.
264          */
readString(final DictBuffer dictBuffer)265         static String readString(final DictBuffer dictBuffer) {
266             final StringBuilder s = new StringBuilder();
267             int character = readChar(dictBuffer);
268             while (character != FormatSpec.INVALID_CHARACTER) {
269                 s.appendCodePoint(character);
270                 character = readChar(dictBuffer);
271             }
272             return s.toString();
273         }
274 
275         /**
276          * Reads a character from the buffer.
277          *
278          * This follows the character format documented earlier in this source file.
279          *
280          * @param dictBuffer the buffer, positioned over an encoded character.
281          * @return the character code.
282          */
readChar(final DictBuffer dictBuffer)283         static int readChar(final DictBuffer dictBuffer) {
284             int character = dictBuffer.readUnsignedByte();
285             if (!fitsOnOneByte(character, null)) {
286                 if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
287                     return FormatSpec.INVALID_CHARACTER;
288                 }
289                 character <<= 16;
290                 character += dictBuffer.readUnsignedShort();
291             }
292             return character;
293         }
294     }
295 
296     /**
297      * Reads and returns the PtNode count out of a buffer and forwards the pointer.
298      */
readPtNodeCount(final DictBuffer dictBuffer)299     /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
300         final int msb = dictBuffer.readUnsignedByte();
301         if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
302             return msb;
303         }
304         return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
305                 + dictBuffer.readUnsignedByte();
306     }
307 
308     /**
309      * Finds, as a string, the word at the position passed as an argument.
310      *
311      * @param dictDecoder the dict decoder.
312      * @param headerSize the size of the header.
313      * @param pos the position to seek.
314      * @return the word with its frequency, as a weighted string.
315      */
316     @UsedForTesting
getWordAtPosition(final DictDecoder dictDecoder, final int headerSize, final int pos)317     /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
318             final int headerSize, final int pos) {
319         final WeightedString result;
320         final int originalPos = dictDecoder.getPosition();
321         dictDecoder.setPosition(pos);
322         result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
323         dictDecoder.setPosition(originalPos);
324         return result;
325     }
326 
getWordAtPositionWithoutParentAddress( final DictDecoder dictDecoder, final int headerSize, final int pos)327     private static WeightedString getWordAtPositionWithoutParentAddress(
328             final DictDecoder dictDecoder, final int headerSize, final int pos) {
329         dictDecoder.setPosition(headerSize);
330         final int count = dictDecoder.readPtNodeCount();
331         int groupPos = dictDecoder.getPosition();
332         final StringBuilder builder = new StringBuilder();
333         WeightedString result = null;
334 
335         PtNodeInfo last = null;
336         for (int i = count - 1; i >= 0; --i) {
337             PtNodeInfo info = dictDecoder.readPtNode(groupPos);
338             groupPos = info.mEndAddress;
339             if (info.mOriginalAddress == pos) {
340                 builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
341                 result = new WeightedString(builder.toString(), info.mProbabilityInfo);
342                 break; // and return
343             }
344             if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
345                 if (info.mChildrenAddress > pos) {
346                     if (null == last) continue;
347                     builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
348                     dictDecoder.setPosition(last.mChildrenAddress);
349                     i = dictDecoder.readPtNodeCount();
350                     groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
351                     last = null;
352                     continue;
353                 }
354                 last = info;
355             }
356             if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
357                 builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
358                 dictDecoder.setPosition(last.mChildrenAddress);
359                 i = dictDecoder.readPtNodeCount();
360                 groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
361                 last = null;
362                 continue;
363             }
364         }
365         return result;
366     }
367 
368     /**
369      * Helper method that brutally decodes a header from a byte array.
370      *
371      * @param headerBuffer a buffer containing the bytes of the header.
372      * @return a hashmap of the attributes stored in the header
373      */
374     @Nonnull
decodeHeaderAttributes(@onnull final byte[] headerBuffer)375     public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer)
376             throws UnsupportedFormatException {
377         final StringBuilder sb = new StringBuilder();
378         final LinkedList<String> keyValues = new LinkedList<>();
379         int index = 0;
380         while (index < headerBuffer.length) {
381             if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) {
382                 keyValues.add(sb.toString());
383                 sb.setLength(0);
384             } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF,
385                     null /* codePointTable */)) {
386                 sb.appendCodePoint(headerBuffer[index] & 0xFF);
387             } else {
388                 sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16)
389                         + ((headerBuffer[index + 1] & 0xFF) << 8)
390                         + (headerBuffer[index + 2] & 0xFF));
391                 index += 2;
392             }
393             index += 1;
394         }
395         if ((keyValues.size() & 1) != 0) {
396             throw new UnsupportedFormatException("Odd number of attributes");
397         }
398         final HashMap<String, String> attributes = new HashMap<>();
399         for (int i = 0; i < keyValues.size(); i += 2) {
400             attributes.put(keyValues.get(i), keyValues.get(i + 1));
401         }
402         return attributes;
403     }
404 
405     /**
406      * Helper method to pass a file name instead of a File object to isBinaryDictionary.
407      */
isBinaryDictionary(final String filename)408     public static boolean isBinaryDictionary(final String filename) {
409         final File file = new File(filename);
410         return isBinaryDictionary(file);
411     }
412 
413     /**
414      * Basic test to find out whether the file is a binary dictionary or not.
415      *
416      * @param file The file to test.
417      * @return true if it's a binary dictionary, false otherwise
418      */
isBinaryDictionary(final File file)419     public static boolean isBinaryDictionary(final File file) {
420         final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
421         if (dictDecoder == null) {
422             return false;
423         }
424         return dictDecoder.hasValidRawBinaryDictionary();
425     }
426 }
427