1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import android.test.AndroidTestCase; 20 import android.test.suitebuilder.annotation.LargeTest; 21 import android.text.TextUtils; 22 import android.util.Pair; 23 24 import com.android.inputmethod.latin.NgramContext.WordInfo; 25 import com.android.inputmethod.latin.common.CodePointUtils; 26 import com.android.inputmethod.latin.common.FileUtils; 27 import com.android.inputmethod.latin.makedict.DictionaryHeader; 28 import com.android.inputmethod.latin.makedict.FormatSpec; 29 import com.android.inputmethod.latin.makedict.WeightedString; 30 import com.android.inputmethod.latin.makedict.WordProperty; 31 import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; 32 33 import java.io.File; 34 import java.io.IOException; 35 import java.util.ArrayList; 36 import java.util.HashMap; 37 import java.util.HashSet; 38 import java.util.Locale; 39 import java.util.Random; 40 41 @LargeTest 42 public class BinaryDictionaryTests extends AndroidTestCase { 43 private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; 44 private static final String TEST_LOCALE = "test"; 45 private static final String DICTIONARY_ID = "TestBinaryDictionary"; 46 47 private HashSet<File> mDictFilesToBeDeleted = new HashSet<>(); 48 49 @Override setUp()50 protected void setUp() throws Exception { 51 super.setUp(); 52 mDictFilesToBeDeleted.clear(); 53 } 54 55 @Override tearDown()56 protected void tearDown() throws Exception { 57 for (final File dictFile : mDictFilesToBeDeleted) { 58 dictFile.delete(); 59 } 60 mDictFilesToBeDeleted.clear(); 61 super.tearDown(); 62 } 63 createEmptyDictionaryAndGetFile(final int formatVersion)64 private File createEmptyDictionaryAndGetFile(final int formatVersion) { 65 return createEmptyDictionaryWithAttributesAndGetFile(formatVersion, 66 new HashMap<String, String>()); 67 } 68 createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)69 private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, 70 final HashMap<String, String> attributeMap) { 71 try { 72 final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion, 73 attributeMap); 74 mDictFilesToBeDeleted.add(dictFile); 75 return dictFile; 76 } catch (final IOException e) { 77 fail(e.toString()); 78 } 79 return null; 80 } 81 createEmptyVer4DictionaryAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)82 private File createEmptyVer4DictionaryAndGetFile(final int formatVersion, 83 final HashMap<String, String> attributeMap) throws IOException { 84 final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION, 85 getContext().getCacheDir()); 86 file.delete(); 87 file.mkdir(); 88 if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion, 89 Locale.ENGLISH, attributeMap)) { 90 return file; 91 } 92 throw new IOException("Empty dictionary " + file.getAbsolutePath() 93 + " cannot be created. Format version: " + formatVersion); 94 } 95 getBinaryDictionary(final File dictFile)96 private static BinaryDictionary getBinaryDictionary(final File dictFile) { 97 return new BinaryDictionary(dictFile.getAbsolutePath(), 98 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 99 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 100 } 101 getEmptyBinaryDictionary(final int formatVersion)102 private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) { 103 final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); 104 return new BinaryDictionary(dictFile.getAbsolutePath(), 105 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 106 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 107 } 108 testIsValidDictionary()109 public void testIsValidDictionary() { 110 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 111 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 112 assertTrue("binaryDictionary must be valid for existing valid dictionary file.", 113 binaryDictionary.isValidDictionary()); 114 binaryDictionary.close(); 115 assertFalse("binaryDictionary must be invalid after closing.", 116 binaryDictionary.isValidDictionary()); 117 FileUtils.deleteRecursively(dictFile); 118 binaryDictionary = getBinaryDictionary(dictFile); 119 assertFalse("binaryDictionary must be invalid for not existing dictionary file.", 120 binaryDictionary.isValidDictionary()); 121 binaryDictionary.close(); 122 } 123 testConstructingDictionaryOnMemory()124 public void testConstructingDictionaryOnMemory() { 125 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 126 FileUtils.deleteRecursively(dictFile); 127 assertFalse(dictFile.exists()); 128 final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 129 true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, 130 FormatSpec.VERSION403, new HashMap<String, String>()); 131 assertTrue(binaryDictionary.isValidDictionary()); 132 assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); 133 final int probability = 100; 134 addUnigramWord(binaryDictionary, "word", probability); 135 assertEquals(probability, binaryDictionary.getFrequency("word")); 136 assertFalse(dictFile.exists()); 137 binaryDictionary.flush(); 138 assertTrue(dictFile.exists()); 139 assertTrue(binaryDictionary.isValidDictionary()); 140 assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); 141 assertEquals(probability, binaryDictionary.getFrequency("word")); 142 binaryDictionary.close(); 143 } 144 testAddTooLongWord()145 public void testAddTooLongWord() { 146 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 147 final StringBuffer stringBuilder = new StringBuffer(); 148 for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) { 149 stringBuilder.append('a'); 150 } 151 final String validLongWord = stringBuilder.toString(); 152 stringBuilder.append('a'); 153 final String invalidLongWord = stringBuilder.toString(); 154 final int probability = 100; 155 addUnigramWord(binaryDictionary, "aaa", probability); 156 addUnigramWord(binaryDictionary, validLongWord, probability); 157 addUnigramWord(binaryDictionary, invalidLongWord, probability); 158 // Too long short cut. 159 binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */, 160 false /* isNotAWord */, false /* isPossiblyOffensive */, 161 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 162 addUnigramWord(binaryDictionary, "abc", probability); 163 final int updatedProbability = 200; 164 // Update. 165 addUnigramWord(binaryDictionary, validLongWord, updatedProbability); 166 addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability); 167 addUnigramWord(binaryDictionary, "abc", updatedProbability); 168 169 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 170 assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord)); 171 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord)); 172 assertEquals(updatedProbability, binaryDictionary.getFrequency("abc")); 173 } 174 addUnigramWord(final BinaryDictionary binaryDictionary, final String word, final int probability)175 private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, 176 final int probability) { 177 binaryDictionary.addUnigramEntry(word, probability, 178 false /* isBeginningOfSentence */, false /* isNotAWord */, 179 false /* isPossiblyOffensive */, 180 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 181 } 182 addBigramWords(final BinaryDictionary binaryDictionary, final String word0, final String word1, final int probability)183 private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, 184 final String word1, final int probability) { 185 binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability, 186 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 187 } 188 addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2, final int probability)189 private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, 190 final String word1, final String word2, final int probability) { 191 binaryDictionary.addNgramEntry( 192 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2, 193 probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 194 } 195 isValidBigram(final BinaryDictionary binaryDictionary, final String word0, final String word1)196 private static boolean isValidBigram(final BinaryDictionary binaryDictionary, 197 final String word0, final String word1) { 198 return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1); 199 } 200 getBigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1)201 private static int getBigramProbability(final BinaryDictionary binaryDictionary, 202 final String word0, final String word1) { 203 return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1); 204 } 205 getTrigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2)206 private static int getTrigramProbability(final BinaryDictionary binaryDictionary, 207 final String word0, final String word1, final String word2) { 208 return binaryDictionary.getNgramProbability( 209 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2); 210 } 211 testAddUnigramWord()212 public void testAddUnigramWord() { 213 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 214 final int probability = 100; 215 addUnigramWord(binaryDictionary, "aaa", probability); 216 // Reallocate and create. 217 addUnigramWord(binaryDictionary, "aab", probability); 218 // Insert into children. 219 addUnigramWord(binaryDictionary, "aac", probability); 220 // Make terminal. 221 addUnigramWord(binaryDictionary, "aa", probability); 222 // Create children. 223 addUnigramWord(binaryDictionary, "aaaa", probability); 224 // Reallocate and make termianl. 225 addUnigramWord(binaryDictionary, "a", probability); 226 227 final int updatedProbability = 200; 228 // Update. 229 addUnigramWord(binaryDictionary, "aaa", updatedProbability); 230 231 assertEquals(probability, binaryDictionary.getFrequency("aab")); 232 assertEquals(probability, binaryDictionary.getFrequency("aac")); 233 assertEquals(probability, binaryDictionary.getFrequency("aa")); 234 assertEquals(probability, binaryDictionary.getFrequency("aaaa")); 235 assertEquals(probability, binaryDictionary.getFrequency("a")); 236 assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); 237 } 238 testRandomlyAddUnigramWord()239 public void testRandomlyAddUnigramWord() { 240 final int wordCount = 1000; 241 final int codePointSetSize = 50; 242 final long seed = System.currentTimeMillis(); 243 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 244 245 final HashMap<String, Integer> probabilityMap = new HashMap<>(); 246 // Test a word that isn't contained within the dictionary. 247 final Random random = new Random(seed); 248 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 249 for (int i = 0; i < wordCount; ++i) { 250 final String word = CodePointUtils.generateWord(random, codePointSet); 251 probabilityMap.put(word, random.nextInt(0xFF)); 252 } 253 for (String word : probabilityMap.keySet()) { 254 addUnigramWord(binaryDictionary, word, probabilityMap.get(word)); 255 } 256 for (String word : probabilityMap.keySet()) { 257 assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); 258 } 259 } 260 testAddBigramWords()261 public void testAddBigramWords() { 262 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 263 264 final int unigramProbability = 100; 265 final int bigramProbability = 150; 266 final int updatedBigramProbability = 200; 267 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 268 addUnigramWord(binaryDictionary, "abb", unigramProbability); 269 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 270 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 271 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 272 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 273 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 274 275 assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); 276 assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc")); 277 assertTrue(isValidBigram(binaryDictionary, "abb", "aaa")); 278 assertTrue(isValidBigram(binaryDictionary, "abb", "bcc")); 279 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 280 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 281 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 282 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 283 284 addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability); 285 assertEquals(updatedBigramProbability, 286 getBigramProbability(binaryDictionary, "aaa", "abb")); 287 288 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 289 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 290 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 291 assertEquals(Dictionary.NOT_A_PROBABILITY, 292 getBigramProbability(binaryDictionary, "bcc", "aaa")); 293 assertEquals(Dictionary.NOT_A_PROBABILITY, 294 getBigramProbability(binaryDictionary, "bcc", "bbc")); 295 assertEquals(Dictionary.NOT_A_PROBABILITY, 296 getBigramProbability(binaryDictionary, "aaa", "aaa")); 297 298 // Testing bigram link. 299 addUnigramWord(binaryDictionary, "abcde", unigramProbability); 300 addUnigramWord(binaryDictionary, "fghij", unigramProbability); 301 addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability); 302 addUnigramWord(binaryDictionary, "fgh", unigramProbability); 303 addUnigramWord(binaryDictionary, "abc", unigramProbability); 304 addUnigramWord(binaryDictionary, "f", unigramProbability); 305 306 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij")); 307 assertEquals(Dictionary.NOT_A_PROBABILITY, 308 getBigramProbability(binaryDictionary, "abcde", "fgh")); 309 addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability); 310 assertEquals(updatedBigramProbability, 311 getBigramProbability(binaryDictionary, "abcde", "fghij")); 312 } 313 testRandomlyAddBigramWords()314 public void testRandomlyAddBigramWords() { 315 final int wordCount = 100; 316 final int bigramCount = 1000; 317 final int codePointSetSize = 50; 318 final long seed = System.currentTimeMillis(); 319 final Random random = new Random(seed); 320 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 321 322 final ArrayList<String> words = new ArrayList<>(); 323 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 324 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 325 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 326 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 327 328 for (int i = 0; i < wordCount; ++i) { 329 final String word = CodePointUtils.generateWord(random, codePointSet); 330 words.add(word); 331 final int unigramProbability = random.nextInt(0xFF); 332 unigramProbabilities.put(word, unigramProbability); 333 addUnigramWord(binaryDictionary, word, unigramProbability); 334 } 335 336 for (int i = 0; i < bigramCount; i++) { 337 final String word0 = words.get(random.nextInt(wordCount)); 338 final String word1 = words.get(random.nextInt(wordCount)); 339 if (TextUtils.equals(word0, word1)) { 340 continue; 341 } 342 final Pair<String, String> bigram = new Pair<>(word0, word1); 343 bigramWords.add(bigram); 344 final int unigramProbability = unigramProbabilities.get(word1); 345 final int bigramProbability = 346 unigramProbability + random.nextInt(0xFF - unigramProbability); 347 bigramProbabilities.put(bigram, bigramProbability); 348 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 349 } 350 351 for (final Pair<String, String> bigram : bigramWords) { 352 final int bigramProbability = bigramProbabilities.get(bigram); 353 assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, 354 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 355 assertEquals(bigramProbability, 356 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 357 } 358 } 359 testAddTrigramWords()360 public void testAddTrigramWords() { 361 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 362 final int unigramProbability = 100; 363 final int trigramProbability = 150; 364 final int updatedTrigramProbability = 200; 365 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 366 addUnigramWord(binaryDictionary, "abb", unigramProbability); 367 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 368 369 addBigramWords(binaryDictionary, "abb", "bcc", 10); 370 addBigramWords(binaryDictionary, "abb", "aaa", 10); 371 372 addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability); 373 addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability); 374 375 assertEquals(trigramProbability, 376 getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc")); 377 assertEquals(trigramProbability, 378 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); 379 assertFalse(isValidBigram(binaryDictionary, "aaa", "abb")); 380 381 addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability); 382 assertEquals(updatedTrigramProbability, 383 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); 384 } 385 testFlushDictionary()386 public void testFlushDictionary() { 387 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 388 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 389 390 final int probability = 100; 391 addUnigramWord(binaryDictionary, "aaa", probability); 392 addUnigramWord(binaryDictionary, "abcd", probability); 393 // Close without flushing. 394 binaryDictionary.close(); 395 396 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 397 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 398 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 399 400 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa")); 401 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd")); 402 403 addUnigramWord(binaryDictionary, "aaa", probability); 404 addUnigramWord(binaryDictionary, "abcd", probability); 405 binaryDictionary.flush(); 406 binaryDictionary.close(); 407 408 binaryDictionary = getBinaryDictionary(dictFile); 409 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 410 assertEquals(probability, binaryDictionary.getFrequency("abcd")); 411 addUnigramWord(binaryDictionary, "bcde", probability); 412 binaryDictionary.flush(); 413 binaryDictionary.close(); 414 415 binaryDictionary = getBinaryDictionary(dictFile); 416 assertEquals(probability, binaryDictionary.getFrequency("bcde")); 417 binaryDictionary.close(); 418 } 419 testFlushWithGCDictionary()420 public void testFlushWithGCDictionary() { 421 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 422 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 423 final int unigramProbability = 100; 424 final int bigramProbability = 150; 425 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 426 addUnigramWord(binaryDictionary, "abb", unigramProbability); 427 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 428 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 429 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 430 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 431 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 432 binaryDictionary.flushWithGC(); 433 binaryDictionary.close(); 434 435 binaryDictionary = getBinaryDictionary(dictFile); 436 assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa")); 437 assertEquals(unigramProbability, binaryDictionary.getFrequency("abb")); 438 assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc")); 439 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 440 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 441 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 442 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 443 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 444 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 445 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 446 binaryDictionary.flushWithGC(); 447 binaryDictionary.close(); 448 } 449 testAddBigramWordsAndFlashWithGC()450 public void testAddBigramWordsAndFlashWithGC() { 451 final int wordCount = 100; 452 final int bigramCount = 1000; 453 final int codePointSetSize = 30; 454 final long seed = System.currentTimeMillis(); 455 final Random random = new Random(seed); 456 457 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 458 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 459 460 final ArrayList<String> words = new ArrayList<>(); 461 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 462 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 463 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 464 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 465 466 for (int i = 0; i < wordCount; ++i) { 467 final String word = CodePointUtils.generateWord(random, codePointSet); 468 words.add(word); 469 final int unigramProbability = random.nextInt(0xFF); 470 unigramProbabilities.put(word, unigramProbability); 471 addUnigramWord(binaryDictionary, word, unigramProbability); 472 } 473 474 for (int i = 0; i < bigramCount; i++) { 475 final String word0 = words.get(random.nextInt(wordCount)); 476 final String word1 = words.get(random.nextInt(wordCount)); 477 if (TextUtils.equals(word0, word1)) { 478 continue; 479 } 480 final Pair<String, String> bigram = new Pair<>(word0, word1); 481 bigramWords.add(bigram); 482 final int unigramProbability = unigramProbabilities.get(word1); 483 final int bigramProbability = 484 unigramProbability + random.nextInt(0xFF - unigramProbability); 485 bigramProbabilities.put(bigram, bigramProbability); 486 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 487 } 488 489 binaryDictionary.flushWithGC(); 490 binaryDictionary.close(); 491 binaryDictionary = getBinaryDictionary(dictFile); 492 493 for (final Pair<String, String> bigram : bigramWords) { 494 final int bigramProbability = bigramProbabilities.get(bigram); 495 assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, 496 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 497 assertEquals(bigramProbability, 498 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 499 } 500 } 501 testRandomOperationsAndFlashWithGC()502 public void testRandomOperationsAndFlashWithGC() { 503 final int maxUnigramCount = 5000; 504 final int maxBigramCount = 10000; 505 final HashMap<String, String> attributeMap = new HashMap<>(); 506 attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); 507 attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); 508 509 final int flashWithGCIterationCount = 50; 510 final int operationCountInEachIteration = 200; 511 final int initialUnigramCount = 100; 512 final float addUnigramProb = 0.5f; 513 final float addBigramProb = 0.8f; 514 final int codePointSetSize = 30; 515 516 final long seed = System.currentTimeMillis(); 517 final Random random = new Random(seed); 518 final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, 519 attributeMap); 520 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 521 522 final ArrayList<String> words = new ArrayList<>(); 523 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 524 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 525 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 526 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 527 for (int i = 0; i < initialUnigramCount; ++i) { 528 final String word = CodePointUtils.generateWord(random, codePointSet); 529 words.add(word); 530 final int unigramProbability = random.nextInt(0xFF); 531 unigramProbabilities.put(word, unigramProbability); 532 addUnigramWord(binaryDictionary, word, unigramProbability); 533 } 534 binaryDictionary.flushWithGC(); 535 binaryDictionary.close(); 536 537 for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) { 538 binaryDictionary = getBinaryDictionary(dictFile); 539 for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) { 540 // Add unigram. 541 if (random.nextFloat() < addUnigramProb) { 542 final String word = CodePointUtils.generateWord(random, codePointSet); 543 words.add(word); 544 final int unigramProbability = random.nextInt(0xFF); 545 unigramProbabilities.put(word, unigramProbability); 546 addUnigramWord(binaryDictionary, word, unigramProbability); 547 } 548 // Add bigram. 549 if (random.nextFloat() < addBigramProb && words.size() > 2) { 550 final int word0Index = random.nextInt(words.size()); 551 int word1Index = random.nextInt(words.size() - 1); 552 if (word0Index <= word1Index) { 553 word1Index++; 554 } 555 final String word0 = words.get(word0Index); 556 final String word1 = words.get(word1Index); 557 if (TextUtils.equals(word0, word1)) { 558 continue; 559 } 560 final int unigramProbability = unigramProbabilities.get(word1); 561 final int bigramProbability = 562 unigramProbability + random.nextInt(0xFF - unigramProbability); 563 final Pair<String, String> bigram = new Pair<>(word0, word1); 564 bigramWords.add(bigram); 565 bigramProbabilities.put(bigram, bigramProbability); 566 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 567 } 568 } 569 570 // Test whether the all unigram operations are collectlly handled. 571 for (int i = 0; i < words.size(); i++) { 572 final String word = words.get(i); 573 final int unigramProbability = unigramProbabilities.get(word); 574 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); 575 } 576 // Test whether the all bigram operations are collectlly handled. 577 for (int i = 0; i < bigramWords.size(); i++) { 578 final Pair<String, String> bigram = bigramWords.get(i); 579 final int probability; 580 if (bigramProbabilities.containsKey(bigram)) { 581 probability = bigramProbabilities.get(bigram); 582 } else { 583 probability = Dictionary.NOT_A_PROBABILITY; 584 } 585 586 assertEquals(probability, 587 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 588 assertEquals(probability != Dictionary.NOT_A_PROBABILITY, 589 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 590 } 591 binaryDictionary.flushWithGC(); 592 binaryDictionary.close(); 593 } 594 } 595 testAddManyUnigramsAndFlushWithGC()596 public void testAddManyUnigramsAndFlushWithGC() { 597 final int flashWithGCIterationCount = 3; 598 final int codePointSetSize = 50; 599 600 final long seed = System.currentTimeMillis(); 601 final Random random = new Random(seed); 602 603 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 604 605 final ArrayList<String> words = new ArrayList<>(); 606 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 607 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 608 609 BinaryDictionary binaryDictionary; 610 for (int i = 0; i < flashWithGCIterationCount; i++) { 611 binaryDictionary = getBinaryDictionary(dictFile); 612 while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { 613 final String word = CodePointUtils.generateWord(random, codePointSet); 614 words.add(word); 615 final int unigramProbability = random.nextInt(0xFF); 616 unigramProbabilities.put(word, unigramProbability); 617 addUnigramWord(binaryDictionary, word, unigramProbability); 618 } 619 620 for (int j = 0; j < words.size(); j++) { 621 final String word = words.get(j); 622 final int unigramProbability = unigramProbabilities.get(word); 623 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); 624 } 625 626 binaryDictionary.flushWithGC(); 627 binaryDictionary.close(); 628 } 629 } 630 testUnigramAndBigramCount()631 public void testUnigramAndBigramCount() { 632 final int maxUnigramCount = 5000; 633 final int maxBigramCount = 10000; 634 final HashMap<String, String> attributeMap = new HashMap<>(); 635 attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); 636 attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); 637 638 final int flashWithGCIterationCount = 10; 639 final int codePointSetSize = 50; 640 final int unigramCountPerIteration = 1000; 641 final int bigramCountPerIteration = 2000; 642 final long seed = System.currentTimeMillis(); 643 final Random random = new Random(seed); 644 final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, 645 attributeMap); 646 647 final ArrayList<String> words = new ArrayList<>(); 648 final HashSet<Pair<String, String>> bigrams = new HashSet<>(); 649 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 650 651 BinaryDictionary binaryDictionary; 652 for (int i = 0; i < flashWithGCIterationCount; i++) { 653 binaryDictionary = getBinaryDictionary(dictFile); 654 for (int j = 0; j < unigramCountPerIteration; j++) { 655 final String word = CodePointUtils.generateWord(random, codePointSet); 656 words.add(word); 657 final int unigramProbability = random.nextInt(0xFF); 658 addUnigramWord(binaryDictionary, word, unigramProbability); 659 } 660 for (int j = 0; j < bigramCountPerIteration; j++) { 661 final String word0 = words.get(random.nextInt(words.size())); 662 final String word1 = words.get(random.nextInt(words.size())); 663 if (TextUtils.equals(word0, word1)) { 664 continue; 665 } 666 bigrams.add(new Pair<>(word0, word1)); 667 final int bigramProbability = random.nextInt(0xF); 668 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 669 } 670 assertEquals(new HashSet<>(words).size(), Integer.parseInt( 671 binaryDictionary.getPropertyForGettingStats( 672 BinaryDictionary.UNIGRAM_COUNT_QUERY))); 673 assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( 674 binaryDictionary.getPropertyForGettingStats( 675 BinaryDictionary.BIGRAM_COUNT_QUERY))); 676 binaryDictionary.flushWithGC(); 677 assertEquals(new HashSet<>(words).size(), Integer.parseInt( 678 binaryDictionary.getPropertyForGettingStats( 679 BinaryDictionary.UNIGRAM_COUNT_QUERY))); 680 assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( 681 binaryDictionary.getPropertyForGettingStats( 682 BinaryDictionary.BIGRAM_COUNT_QUERY))); 683 binaryDictionary.close(); 684 } 685 } 686 testGetWordProperties()687 public void testGetWordProperties() { 688 final long seed = System.currentTimeMillis(); 689 final Random random = new Random(seed); 690 final int UNIGRAM_COUNT = 1000; 691 final int BIGRAM_COUNT = 1000; 692 final int codePointSetSize = 20; 693 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 694 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 695 final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 696 697 final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", 698 false /* isBeginningOfSentence */); 699 assertFalse(invalidWordProperty.isValid()); 700 701 final ArrayList<String> words = new ArrayList<>(); 702 final HashMap<String, Integer> wordProbabilities = new HashMap<>(); 703 final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); 704 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 705 706 for (int i = 0; i < UNIGRAM_COUNT; i++) { 707 final String word = CodePointUtils.generateWord(random, codePointSet); 708 final int unigramProbability = random.nextInt(0xFF); 709 final boolean isNotAWord = random.nextBoolean(); 710 final boolean isPossiblyOffensive = random.nextBoolean(); 711 // TODO: Add tests for historical info. 712 binaryDictionary.addUnigramEntry(word, unigramProbability, 713 false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive, 714 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 715 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 716 binaryDictionary.flushWithGC(); 717 } 718 words.add(word); 719 wordProbabilities.put(word, unigramProbability); 720 final WordProperty wordProperty = binaryDictionary.getWordProperty(word, 721 false /* isBeginningOfSentence */); 722 assertEquals(word, wordProperty.mWord); 723 assertTrue(wordProperty.isValid()); 724 assertEquals(isNotAWord, wordProperty.mIsNotAWord); 725 assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive); 726 assertEquals(false, wordProperty.mHasNgrams); 727 assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability); 728 } 729 730 for (int i = 0; i < BIGRAM_COUNT; i++) { 731 final int word0Index = random.nextInt(wordProbabilities.size()); 732 final int word1Index = random.nextInt(wordProbabilities.size()); 733 if (word0Index == word1Index) { 734 continue; 735 } 736 final String word0 = words.get(word0Index); 737 final String word1 = words.get(word1Index); 738 final int unigramProbability = wordProbabilities.get(word1); 739 final int bigramProbability = 740 unigramProbability + random.nextInt(0xFF - unigramProbability); 741 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 742 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 743 binaryDictionary.flushWithGC(); 744 } 745 if (!bigrams.containsKey(word0)) { 746 final HashSet<String> bigramWord1s = new HashSet<>(); 747 bigrams.put(word0, bigramWord1s); 748 } 749 bigrams.get(word0).add(word1); 750 bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability); 751 } 752 753 for (int i = 0; i < words.size(); i++) { 754 final String word0 = words.get(i); 755 if (!bigrams.containsKey(word0)) { 756 continue; 757 } 758 final HashSet<String> bigramWord1s = bigrams.get(word0); 759 final WordProperty wordProperty = binaryDictionary.getWordProperty(word0, 760 false /* isBeginningOfSentence */); 761 assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size()); 762 // TODO: Support ngram. 763 for (final WeightedString bigramTarget : wordProperty.getBigrams()) { 764 final String word1 = bigramTarget.mWord; 765 assertTrue(bigramWord1s.contains(word1)); 766 final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1)); 767 assertEquals(bigramProbability, bigramTarget.getProbability()); 768 } 769 } 770 } 771 testIterateAllWords()772 public void testIterateAllWords() { 773 final long seed = System.currentTimeMillis(); 774 final Random random = new Random(seed); 775 final int UNIGRAM_COUNT = 1000; 776 final int BIGRAM_COUNT = 1000; 777 final int codePointSetSize = 20; 778 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 779 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 780 781 final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", 782 false /* isBeginningOfSentence */); 783 assertFalse(invalidWordProperty.isValid()); 784 785 final ArrayList<String> words = new ArrayList<>(); 786 final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>(); 787 final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); 788 final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater = 789 new HashMap<>(); 790 791 for (int i = 0; i < UNIGRAM_COUNT; i++) { 792 final String word = CodePointUtils.generateWord(random, codePointSet); 793 final int unigramProbability = random.nextInt(0xFF); 794 addUnigramWord(binaryDictionary, word, unigramProbability); 795 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 796 binaryDictionary.flushWithGC(); 797 } 798 words.add(word); 799 wordProbabilitiesToCheckLater.put(word, unigramProbability); 800 } 801 802 for (int i = 0; i < BIGRAM_COUNT; i++) { 803 final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size()); 804 final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size()); 805 if (word0Index == word1Index) { 806 continue; 807 } 808 final String word0 = words.get(word0Index); 809 final String word1 = words.get(word1Index); 810 final int unigramProbability = wordProbabilitiesToCheckLater.get(word1); 811 final int bigramProbability = 812 unigramProbability + random.nextInt(0xFF - unigramProbability); 813 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 814 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 815 binaryDictionary.flushWithGC(); 816 } 817 if (!bigrams.containsKey(word0)) { 818 final HashSet<String> bigramWord1s = new HashSet<>(); 819 bigrams.put(word0, bigramWord1s); 820 } 821 bigrams.get(word0).add(word1); 822 bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability); 823 } 824 825 final HashSet<String> wordSet = new HashSet<>(words); 826 final HashSet<Pair<String, String>> bigramSet = 827 new HashSet<>(bigramProbabilitiesToCheckLater.keySet()); 828 int token = 0; 829 do { 830 final BinaryDictionary.GetNextWordPropertyResult result = 831 binaryDictionary.getNextWordProperty(token); 832 final WordProperty wordProperty = result.mWordProperty; 833 final String word0 = wordProperty.mWord; 834 assertEquals((int)wordProbabilitiesToCheckLater.get(word0), 835 wordProperty.mProbabilityInfo.mProbability); 836 wordSet.remove(word0); 837 final HashSet<String> bigramWord1s = bigrams.get(word0); 838 // TODO: Support ngram. 839 if (wordProperty.mHasNgrams) { 840 for (final WeightedString bigramTarget : wordProperty.getBigrams()) { 841 final String word1 = bigramTarget.mWord; 842 assertTrue(bigramWord1s.contains(word1)); 843 final Pair<String, String> bigram = new Pair<>(word0, word1); 844 final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram); 845 assertEquals(bigramProbability, bigramTarget.getProbability()); 846 bigramSet.remove(bigram); 847 } 848 } 849 token = result.mNextToken; 850 } while (token != 0); 851 assertTrue(wordSet.isEmpty()); 852 assertTrue(bigramSet.isEmpty()); 853 } 854 testPossiblyOffensiveAttributeMaintained()855 public void testPossiblyOffensiveAttributeMaintained() { 856 final BinaryDictionary binaryDictionary = 857 getEmptyBinaryDictionary(FormatSpec.VERSION403); 858 binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0); 859 WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false); 860 assertEquals(true, wordProperty.mIsPossiblyOffensive); 861 } 862 testBeginningOfSentence()863 public void testBeginningOfSentence() { 864 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 865 final int dummyProbability = 0; 866 final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE; 867 final int bigramProbability = 200; 868 addUnigramWord(binaryDictionary, "aaa", dummyProbability); 869 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, 870 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 871 assertEquals(bigramProbability, 872 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); 873 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, 874 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 875 addUnigramWord(binaryDictionary, "bbb", dummyProbability); 876 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability, 877 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 878 binaryDictionary.flushWithGC(); 879 assertEquals(bigramProbability, 880 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); 881 assertEquals(bigramProbability, 882 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb")); 883 } 884 } 885