1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 21 import java.io.File; 22 import java.io.IOException; 23 import java.io.OutputStream; 24 import java.nio.ByteBuffer; 25 26 /** 27 * Decodes binary files for a FusionDictionary. 28 * 29 * All the methods in this class are static. 30 * 31 * TODO: Move this file to makedict/internal. 32 * TODO: Rename this class to DictDecoderUtils. 33 */ 34 public final class BinaryDictDecoderUtils { BinaryDictDecoderUtils()35 private BinaryDictDecoderUtils() { 36 // This utility class is not publicly instantiable. 37 } 38 39 @UsedForTesting 40 public interface DictBuffer { readUnsignedByte()41 public int readUnsignedByte(); readUnsignedShort()42 public int readUnsignedShort(); readUnsignedInt24()43 public int readUnsignedInt24(); readInt()44 public int readInt(); position()45 public int position(); position(int newPosition)46 public void position(int newPosition); 47 @UsedForTesting put(final byte b)48 public void put(final byte b); limit()49 public int limit(); 50 @UsedForTesting capacity()51 public int capacity(); 52 } 53 54 public static final class ByteBufferDictBuffer implements DictBuffer { 55 private ByteBuffer mBuffer; 56 ByteBufferDictBuffer(final ByteBuffer buffer)57 public ByteBufferDictBuffer(final ByteBuffer buffer) { 58 mBuffer = buffer; 59 } 60 61 @Override readUnsignedByte()62 public int readUnsignedByte() { 63 return mBuffer.get() & 0xFF; 64 } 65 66 @Override readUnsignedShort()67 public int readUnsignedShort() { 68 return mBuffer.getShort() & 0xFFFF; 69 } 70 71 @Override readUnsignedInt24()72 public int readUnsignedInt24() { 73 final int retval = readUnsignedByte(); 74 return (retval << 16) + readUnsignedShort(); 75 } 76 77 @Override readInt()78 public int readInt() { 79 return mBuffer.getInt(); 80 } 81 82 @Override position()83 public int position() { 84 return mBuffer.position(); 85 } 86 87 @Override position(int newPos)88 public void position(int newPos) { 89 mBuffer.position(newPos); 90 } 91 92 @Override put(final byte b)93 public void put(final byte b) { 94 mBuffer.put(b); 95 } 96 97 @Override limit()98 public int limit() { 99 return mBuffer.limit(); 100 } 101 102 @Override capacity()103 public int capacity() { 104 return mBuffer.capacity(); 105 } 106 } 107 108 /** 109 * A class grouping utility function for our specific character encoding. 110 */ 111 static final class CharEncoding { 112 private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 113 private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; 114 115 /** 116 * Helper method to find out whether this code fits on one byte 117 */ fitsOnOneByte(final int character)118 private static boolean fitsOnOneByte(final int character) { 119 return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE 120 && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; 121 } 122 123 /** 124 * Compute the size of a character given its character code. 125 * 126 * Char format is: 127 * 1 byte = bbbbbbbb match 128 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 129 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 130 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 131 * 00011111 would be outside unicode. 132 * else: iso-latin-1 code 133 * This allows for the whole unicode range to be encoded, including chars outside of 134 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 135 * characters which should never happen anyway (and still work, but take 3 bytes). 136 * 137 * @param character the character code. 138 * @return the size in binary encoded-form, either 1 or 3 bytes. 139 */ getCharSize(final int character)140 static int getCharSize(final int character) { 141 // See char encoding in FusionDictionary.java 142 if (fitsOnOneByte(character)) return 1; 143 if (FormatSpec.INVALID_CHARACTER == character) return 1; 144 return 3; 145 } 146 147 /** 148 * Compute the byte size of a character array. 149 */ getCharArraySize(final int[] chars)150 static int getCharArraySize(final int[] chars) { 151 int size = 0; 152 for (int character : chars) size += getCharSize(character); 153 return size; 154 } 155 156 /** 157 * Writes a char array to a byte buffer. 158 * 159 * @param codePoints the code point array to write. 160 * @param buffer the byte buffer to write to. 161 * @param index the index in buffer to write the character array to. 162 * @return the index after the last character. 163 */ writeCharArray(final int[] codePoints, final byte[] buffer, int index)164 static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { 165 for (int codePoint : codePoints) { 166 if (1 == getCharSize(codePoint)) { 167 buffer[index++] = (byte)codePoint; 168 } else { 169 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 170 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 171 buffer[index++] = (byte)(0xFF & codePoint); 172 } 173 } 174 return index; 175 } 176 177 /** 178 * Writes a string with our character format to a byte buffer. 179 * 180 * This will also write the terminator byte. 181 * 182 * @param buffer the byte buffer to write to. 183 * @param origin the offset to write from. 184 * @param word the string to write. 185 * @return the size written, in bytes. 186 */ writeString(final byte[] buffer, final int origin, final String word)187 static int writeString(final byte[] buffer, final int origin, final String word) { 188 final int length = word.length(); 189 int index = origin; 190 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 191 final int codePoint = word.codePointAt(i); 192 if (1 == getCharSize(codePoint)) { 193 buffer[index++] = (byte)codePoint; 194 } else { 195 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 196 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 197 buffer[index++] = (byte)(0xFF & codePoint); 198 } 199 } 200 buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; 201 return index - origin; 202 } 203 204 /** 205 * Writes a string with our character format to an OutputStream. 206 * 207 * This will also write the terminator byte. 208 * 209 * @param stream the OutputStream to write to. 210 * @param word the string to write. 211 * @return the size written, in bytes. 212 */ writeString(final OutputStream stream, final String word)213 static int writeString(final OutputStream stream, final String word) throws IOException { 214 final int length = word.length(); 215 int written = 0; 216 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 217 final int codePoint = word.codePointAt(i); 218 final int charSize = getCharSize(codePoint); 219 if (1 == charSize) { 220 stream.write((byte) codePoint); 221 } else { 222 stream.write((byte) (0xFF & (codePoint >> 16))); 223 stream.write((byte) (0xFF & (codePoint >> 8))); 224 stream.write((byte) (0xFF & codePoint)); 225 } 226 written += charSize; 227 } 228 stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); 229 written += FormatSpec.PTNODE_TERMINATOR_SIZE; 230 return written; 231 } 232 233 /** 234 * Reads a string from a DictBuffer. This is the converse of the above method. 235 */ readString(final DictBuffer dictBuffer)236 static String readString(final DictBuffer dictBuffer) { 237 final StringBuilder s = new StringBuilder(); 238 int character = readChar(dictBuffer); 239 while (character != FormatSpec.INVALID_CHARACTER) { 240 s.appendCodePoint(character); 241 character = readChar(dictBuffer); 242 } 243 return s.toString(); 244 } 245 246 /** 247 * Reads a character from the buffer. 248 * 249 * This follows the character format documented earlier in this source file. 250 * 251 * @param dictBuffer the buffer, positioned over an encoded character. 252 * @return the character code. 253 */ readChar(final DictBuffer dictBuffer)254 static int readChar(final DictBuffer dictBuffer) { 255 int character = dictBuffer.readUnsignedByte(); 256 if (!fitsOnOneByte(character)) { 257 if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { 258 return FormatSpec.INVALID_CHARACTER; 259 } 260 character <<= 16; 261 character += dictBuffer.readUnsignedShort(); 262 } 263 return character; 264 } 265 } 266 267 /** 268 * Reads and returns the PtNode count out of a buffer and forwards the pointer. 269 */ readPtNodeCount(final DictBuffer dictBuffer)270 /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) { 271 final int msb = dictBuffer.readUnsignedByte(); 272 if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) { 273 return msb; 274 } else { 275 return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8) 276 + dictBuffer.readUnsignedByte(); 277 } 278 } 279 280 /** 281 * Finds, as a string, the word at the position passed as an argument. 282 * 283 * @param dictDecoder the dict decoder. 284 * @param headerSize the size of the header. 285 * @param pos the position to seek. 286 * @return the word with its frequency, as a weighted string. 287 */ 288 @UsedForTesting getWordAtPosition(final DictDecoder dictDecoder, final int headerSize, final int pos)289 /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder, 290 final int headerSize, final int pos) { 291 final WeightedString result; 292 final int originalPos = dictDecoder.getPosition(); 293 dictDecoder.setPosition(pos); 294 result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos); 295 dictDecoder.setPosition(originalPos); 296 return result; 297 } 298 getWordAtPositionWithoutParentAddress( final DictDecoder dictDecoder, final int headerSize, final int pos)299 private static WeightedString getWordAtPositionWithoutParentAddress( 300 final DictDecoder dictDecoder, final int headerSize, final int pos) { 301 dictDecoder.setPosition(headerSize); 302 final int count = dictDecoder.readPtNodeCount(); 303 int groupPos = dictDecoder.getPosition(); 304 final StringBuilder builder = new StringBuilder(); 305 WeightedString result = null; 306 307 PtNodeInfo last = null; 308 for (int i = count - 1; i >= 0; --i) { 309 PtNodeInfo info = dictDecoder.readPtNode(groupPos); 310 groupPos = info.mEndAddress; 311 if (info.mOriginalAddress == pos) { 312 builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); 313 result = new WeightedString(builder.toString(), info.mProbabilityInfo); 314 break; // and return 315 } 316 if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) { 317 if (info.mChildrenAddress > pos) { 318 if (null == last) continue; 319 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 320 dictDecoder.setPosition(last.mChildrenAddress); 321 i = dictDecoder.readPtNodeCount(); 322 groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); 323 last = null; 324 continue; 325 } 326 last = info; 327 } 328 if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) { 329 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 330 dictDecoder.setPosition(last.mChildrenAddress); 331 i = dictDecoder.readPtNodeCount(); 332 groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); 333 last = null; 334 continue; 335 } 336 } 337 return result; 338 } 339 340 /** 341 * Helper method to pass a file name instead of a File object to isBinaryDictionary. 342 */ isBinaryDictionary(final String filename)343 public static boolean isBinaryDictionary(final String filename) { 344 final File file = new File(filename); 345 return isBinaryDictionary(file); 346 } 347 348 /** 349 * Basic test to find out whether the file is a binary dictionary or not. 350 * 351 * @param file The file to test. 352 * @return true if it's a binary dictionary, false otherwise 353 */ isBinaryDictionary(final File file)354 public static boolean isBinaryDictionary(final File file) { 355 final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length()); 356 if (dictDecoder == null) { 357 return false; 358 } 359 return dictDecoder.hasValidRawBinaryDictionary(); 360 } 361 } 362