1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import static org.junit.Assert.assertEquals; 20 import static org.junit.Assert.assertFalse; 21 import static org.junit.Assert.assertTrue; 22 import static org.junit.Assert.fail; 23 24 import android.text.TextUtils; 25 import android.util.Pair; 26 27 import androidx.test.InstrumentationRegistry; 28 import androidx.test.filters.LargeTest; 29 import androidx.test.runner.AndroidJUnit4; 30 31 import com.android.inputmethod.latin.NgramContext.WordInfo; 32 import com.android.inputmethod.latin.common.CodePointUtils; 33 import com.android.inputmethod.latin.common.FileUtils; 34 import com.android.inputmethod.latin.makedict.DictionaryHeader; 35 import com.android.inputmethod.latin.makedict.FormatSpec; 36 import com.android.inputmethod.latin.makedict.WeightedString; 37 import com.android.inputmethod.latin.makedict.WordProperty; 38 import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; 39 40 import org.junit.After; 41 import org.junit.Before; 42 import org.junit.Test; 43 import org.junit.runner.RunWith; 44 45 import java.io.File; 46 import java.io.IOException; 47 import java.util.ArrayList; 48 import java.util.HashMap; 49 import java.util.HashSet; 50 import java.util.Locale; 51 import java.util.Random; 52 53 @LargeTest 54 @RunWith(AndroidJUnit4.class) 55 public class BinaryDictionaryTests { 56 private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; 57 private static final String TEST_LOCALE = "test"; 58 private static final String DICTIONARY_ID = "TestBinaryDictionary"; 59 60 private HashSet<File> mDictFilesToBeDeleted = new HashSet<>(); 61 62 @Before setUp()63 public void setUp() throws Exception { 64 mDictFilesToBeDeleted.clear(); 65 } 66 67 @After tearDown()68 public void tearDown() throws Exception { 69 for (final File dictFile : mDictFilesToBeDeleted) { 70 dictFile.delete(); 71 } 72 mDictFilesToBeDeleted.clear(); 73 } 74 createEmptyDictionaryAndGetFile(final int formatVersion)75 private File createEmptyDictionaryAndGetFile(final int formatVersion) { 76 return createEmptyDictionaryWithAttributesAndGetFile(formatVersion, 77 new HashMap<String, String>()); 78 } 79 createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)80 private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, 81 final HashMap<String, String> attributeMap) { 82 try { 83 final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion, 84 attributeMap); 85 mDictFilesToBeDeleted.add(dictFile); 86 return dictFile; 87 } catch (final IOException e) { 88 fail(e.toString()); 89 } 90 return null; 91 } 92 createEmptyVer4DictionaryAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)93 private File createEmptyVer4DictionaryAndGetFile(final int formatVersion, 94 final HashMap<String, String> attributeMap) throws IOException { 95 final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION, 96 InstrumentationRegistry.getTargetContext().getCacheDir()); 97 file.delete(); 98 file.mkdir(); 99 if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion, 100 Locale.ENGLISH, attributeMap)) { 101 return file; 102 } 103 throw new IOException("Empty dictionary " + file.getAbsolutePath() 104 + " cannot be created. Format version: " + formatVersion); 105 } 106 getBinaryDictionary(final File dictFile)107 private static BinaryDictionary getBinaryDictionary(final File dictFile) { 108 return new BinaryDictionary(dictFile.getAbsolutePath(), 109 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 110 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 111 } 112 getEmptyBinaryDictionary(final int formatVersion)113 private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) { 114 final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); 115 return new BinaryDictionary(dictFile.getAbsolutePath(), 116 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 117 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 118 } 119 120 @Test testIsValidDictionary()121 public void testIsValidDictionary() { 122 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 123 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 124 assertTrue("binaryDictionary must be valid for existing valid dictionary file.", 125 binaryDictionary.isValidDictionary()); 126 binaryDictionary.close(); 127 assertFalse("binaryDictionary must be invalid after closing.", 128 binaryDictionary.isValidDictionary()); 129 FileUtils.deleteRecursively(dictFile); 130 binaryDictionary = getBinaryDictionary(dictFile); 131 assertFalse("binaryDictionary must be invalid for not existing dictionary file.", 132 binaryDictionary.isValidDictionary()); 133 binaryDictionary.close(); 134 } 135 136 @Test testConstructingDictionaryOnMemory()137 public void testConstructingDictionaryOnMemory() { 138 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 139 FileUtils.deleteRecursively(dictFile); 140 assertFalse(dictFile.exists()); 141 final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 142 true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, 143 FormatSpec.VERSION403, new HashMap<String, String>()); 144 assertTrue(binaryDictionary.isValidDictionary()); 145 assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); 146 final int probability = 100; 147 addUnigramWord(binaryDictionary, "word", probability); 148 assertEquals(probability, binaryDictionary.getFrequency("word")); 149 assertFalse(dictFile.exists()); 150 binaryDictionary.flush(); 151 assertTrue(dictFile.exists()); 152 assertTrue(binaryDictionary.isValidDictionary()); 153 assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); 154 assertEquals(probability, binaryDictionary.getFrequency("word")); 155 binaryDictionary.close(); 156 } 157 158 @Test testAddTooLongWord()159 public void testAddTooLongWord() { 160 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 161 final StringBuffer stringBuilder = new StringBuffer(); 162 for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) { 163 stringBuilder.append('a'); 164 } 165 final String validLongWord = stringBuilder.toString(); 166 stringBuilder.append('a'); 167 final String invalidLongWord = stringBuilder.toString(); 168 final int probability = 100; 169 addUnigramWord(binaryDictionary, "aaa", probability); 170 addUnigramWord(binaryDictionary, validLongWord, probability); 171 addUnigramWord(binaryDictionary, invalidLongWord, probability); 172 // Too long short cut. 173 binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */, 174 false /* isNotAWord */, false /* isPossiblyOffensive */, 175 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 176 addUnigramWord(binaryDictionary, "abc", probability); 177 final int updatedProbability = 200; 178 // Update. 179 addUnigramWord(binaryDictionary, validLongWord, updatedProbability); 180 addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability); 181 addUnigramWord(binaryDictionary, "abc", updatedProbability); 182 183 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 184 assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord)); 185 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord)); 186 assertEquals(updatedProbability, binaryDictionary.getFrequency("abc")); 187 } 188 addUnigramWord(final BinaryDictionary binaryDictionary, final String word, final int probability)189 private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, 190 final int probability) { 191 binaryDictionary.addUnigramEntry(word, probability, 192 false /* isBeginningOfSentence */, false /* isNotAWord */, 193 false /* isPossiblyOffensive */, 194 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 195 } 196 addBigramWords(final BinaryDictionary binaryDictionary, final String word0, final String word1, final int probability)197 private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, 198 final String word1, final int probability) { 199 binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability, 200 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 201 } 202 addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2, final int probability)203 private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, 204 final String word1, final String word2, final int probability) { 205 binaryDictionary.addNgramEntry( 206 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2, 207 probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 208 } 209 isValidBigram(final BinaryDictionary binaryDictionary, final String word0, final String word1)210 private static boolean isValidBigram(final BinaryDictionary binaryDictionary, 211 final String word0, final String word1) { 212 return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1); 213 } 214 getBigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1)215 private static int getBigramProbability(final BinaryDictionary binaryDictionary, 216 final String word0, final String word1) { 217 return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1); 218 } 219 getTrigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2)220 private static int getTrigramProbability(final BinaryDictionary binaryDictionary, 221 final String word0, final String word1, final String word2) { 222 return binaryDictionary.getNgramProbability( 223 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2); 224 } 225 226 @Test testAddUnigramWord()227 public void testAddUnigramWord() { 228 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 229 final int probability = 100; 230 addUnigramWord(binaryDictionary, "aaa", probability); 231 // Reallocate and create. 232 addUnigramWord(binaryDictionary, "aab", probability); 233 // Insert into children. 234 addUnigramWord(binaryDictionary, "aac", probability); 235 // Make terminal. 236 addUnigramWord(binaryDictionary, "aa", probability); 237 // Create children. 238 addUnigramWord(binaryDictionary, "aaaa", probability); 239 // Reallocate and make termianl. 240 addUnigramWord(binaryDictionary, "a", probability); 241 242 final int updatedProbability = 200; 243 // Update. 244 addUnigramWord(binaryDictionary, "aaa", updatedProbability); 245 246 assertEquals(probability, binaryDictionary.getFrequency("aab")); 247 assertEquals(probability, binaryDictionary.getFrequency("aac")); 248 assertEquals(probability, binaryDictionary.getFrequency("aa")); 249 assertEquals(probability, binaryDictionary.getFrequency("aaaa")); 250 assertEquals(probability, binaryDictionary.getFrequency("a")); 251 assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); 252 } 253 254 @Test testRandomlyAddUnigramWord()255 public void testRandomlyAddUnigramWord() { 256 final int wordCount = 1000; 257 final int codePointSetSize = 50; 258 final long seed = System.currentTimeMillis(); 259 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 260 261 final HashMap<String, Integer> probabilityMap = new HashMap<>(); 262 // Test a word that isn't contained within the dictionary. 263 final Random random = new Random(seed); 264 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 265 for (int i = 0; i < wordCount; ++i) { 266 final String word = CodePointUtils.generateWord(random, codePointSet); 267 probabilityMap.put(word, random.nextInt(0xFF)); 268 } 269 for (String word : probabilityMap.keySet()) { 270 addUnigramWord(binaryDictionary, word, probabilityMap.get(word)); 271 } 272 for (String word : probabilityMap.keySet()) { 273 assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); 274 } 275 } 276 277 @Test testAddBigramWords()278 public void testAddBigramWords() { 279 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 280 281 final int unigramProbability = 100; 282 final int bigramProbability = 150; 283 final int updatedBigramProbability = 200; 284 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 285 addUnigramWord(binaryDictionary, "abb", unigramProbability); 286 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 287 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 288 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 289 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 290 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 291 292 assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); 293 assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc")); 294 assertTrue(isValidBigram(binaryDictionary, "abb", "aaa")); 295 assertTrue(isValidBigram(binaryDictionary, "abb", "bcc")); 296 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 297 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 298 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 299 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 300 301 addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability); 302 assertEquals(updatedBigramProbability, 303 getBigramProbability(binaryDictionary, "aaa", "abb")); 304 305 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 306 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 307 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 308 assertEquals(Dictionary.NOT_A_PROBABILITY, 309 getBigramProbability(binaryDictionary, "bcc", "aaa")); 310 assertEquals(Dictionary.NOT_A_PROBABILITY, 311 getBigramProbability(binaryDictionary, "bcc", "bbc")); 312 assertEquals(Dictionary.NOT_A_PROBABILITY, 313 getBigramProbability(binaryDictionary, "aaa", "aaa")); 314 315 // Testing bigram link. 316 addUnigramWord(binaryDictionary, "abcde", unigramProbability); 317 addUnigramWord(binaryDictionary, "fghij", unigramProbability); 318 addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability); 319 addUnigramWord(binaryDictionary, "fgh", unigramProbability); 320 addUnigramWord(binaryDictionary, "abc", unigramProbability); 321 addUnigramWord(binaryDictionary, "f", unigramProbability); 322 323 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij")); 324 assertEquals(Dictionary.NOT_A_PROBABILITY, 325 getBigramProbability(binaryDictionary, "abcde", "fgh")); 326 addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability); 327 assertEquals(updatedBigramProbability, 328 getBigramProbability(binaryDictionary, "abcde", "fghij")); 329 } 330 331 @Test testRandomlyAddBigramWords()332 public void testRandomlyAddBigramWords() { 333 final int wordCount = 100; 334 final int bigramCount = 1000; 335 final int codePointSetSize = 50; 336 final long seed = System.currentTimeMillis(); 337 final Random random = new Random(seed); 338 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 339 340 final ArrayList<String> words = new ArrayList<>(); 341 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 342 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 343 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 344 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 345 346 for (int i = 0; i < wordCount; ++i) { 347 final String word = CodePointUtils.generateWord(random, codePointSet); 348 words.add(word); 349 final int unigramProbability = random.nextInt(0xFF); 350 unigramProbabilities.put(word, unigramProbability); 351 addUnigramWord(binaryDictionary, word, unigramProbability); 352 } 353 354 for (int i = 0; i < bigramCount; i++) { 355 final String word0 = words.get(random.nextInt(wordCount)); 356 final String word1 = words.get(random.nextInt(wordCount)); 357 if (TextUtils.equals(word0, word1)) { 358 continue; 359 } 360 final Pair<String, String> bigram = new Pair<>(word0, word1); 361 bigramWords.add(bigram); 362 final int unigramProbability = unigramProbabilities.get(word1); 363 final int bigramProbability = 364 unigramProbability + random.nextInt(0xFF - unigramProbability); 365 bigramProbabilities.put(bigram, bigramProbability); 366 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 367 } 368 369 for (final Pair<String, String> bigram : bigramWords) { 370 final int bigramProbability = bigramProbabilities.get(bigram); 371 assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, 372 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 373 assertEquals(bigramProbability, 374 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 375 } 376 } 377 378 @Test testAddTrigramWords()379 public void testAddTrigramWords() { 380 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 381 final int unigramProbability = 100; 382 final int trigramProbability = 150; 383 final int updatedTrigramProbability = 200; 384 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 385 addUnigramWord(binaryDictionary, "abb", unigramProbability); 386 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 387 388 addBigramWords(binaryDictionary, "abb", "bcc", 10); 389 addBigramWords(binaryDictionary, "abb", "aaa", 10); 390 391 addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability); 392 addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability); 393 394 assertEquals(trigramProbability, 395 getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc")); 396 assertEquals(trigramProbability, 397 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); 398 assertFalse(isValidBigram(binaryDictionary, "aaa", "abb")); 399 400 addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability); 401 assertEquals(updatedTrigramProbability, 402 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); 403 } 404 405 @Test testFlushDictionary()406 public void testFlushDictionary() { 407 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 408 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 409 410 final int probability = 100; 411 addUnigramWord(binaryDictionary, "aaa", probability); 412 addUnigramWord(binaryDictionary, "abcd", probability); 413 // Close without flushing. 414 binaryDictionary.close(); 415 416 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 417 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 418 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 419 420 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa")); 421 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd")); 422 423 addUnigramWord(binaryDictionary, "aaa", probability); 424 addUnigramWord(binaryDictionary, "abcd", probability); 425 binaryDictionary.flush(); 426 binaryDictionary.close(); 427 428 binaryDictionary = getBinaryDictionary(dictFile); 429 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 430 assertEquals(probability, binaryDictionary.getFrequency("abcd")); 431 addUnigramWord(binaryDictionary, "bcde", probability); 432 binaryDictionary.flush(); 433 binaryDictionary.close(); 434 435 binaryDictionary = getBinaryDictionary(dictFile); 436 assertEquals(probability, binaryDictionary.getFrequency("bcde")); 437 binaryDictionary.close(); 438 } 439 440 @Test testFlushWithGCDictionary()441 public void testFlushWithGCDictionary() { 442 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 443 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 444 final int unigramProbability = 100; 445 final int bigramProbability = 150; 446 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 447 addUnigramWord(binaryDictionary, "abb", unigramProbability); 448 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 449 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 450 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 451 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 452 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 453 binaryDictionary.flushWithGC(); 454 binaryDictionary.close(); 455 456 binaryDictionary = getBinaryDictionary(dictFile); 457 assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa")); 458 assertEquals(unigramProbability, binaryDictionary.getFrequency("abb")); 459 assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc")); 460 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 461 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 462 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 463 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 464 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 465 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 466 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 467 binaryDictionary.flushWithGC(); 468 binaryDictionary.close(); 469 } 470 471 @Test testAddBigramWordsAndFlashWithGC()472 public void testAddBigramWordsAndFlashWithGC() { 473 final int wordCount = 100; 474 final int bigramCount = 1000; 475 final int codePointSetSize = 30; 476 final long seed = System.currentTimeMillis(); 477 final Random random = new Random(seed); 478 479 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 480 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 481 482 final ArrayList<String> words = new ArrayList<>(); 483 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 484 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 485 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 486 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 487 488 for (int i = 0; i < wordCount; ++i) { 489 final String word = CodePointUtils.generateWord(random, codePointSet); 490 words.add(word); 491 final int unigramProbability = random.nextInt(0xFF); 492 unigramProbabilities.put(word, unigramProbability); 493 addUnigramWord(binaryDictionary, word, unigramProbability); 494 } 495 496 for (int i = 0; i < bigramCount; i++) { 497 final String word0 = words.get(random.nextInt(wordCount)); 498 final String word1 = words.get(random.nextInt(wordCount)); 499 if (TextUtils.equals(word0, word1)) { 500 continue; 501 } 502 final Pair<String, String> bigram = new Pair<>(word0, word1); 503 bigramWords.add(bigram); 504 final int unigramProbability = unigramProbabilities.get(word1); 505 final int bigramProbability = 506 unigramProbability + random.nextInt(0xFF - unigramProbability); 507 bigramProbabilities.put(bigram, bigramProbability); 508 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 509 } 510 511 binaryDictionary.flushWithGC(); 512 binaryDictionary.close(); 513 binaryDictionary = getBinaryDictionary(dictFile); 514 515 for (final Pair<String, String> bigram : bigramWords) { 516 final int bigramProbability = bigramProbabilities.get(bigram); 517 assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, 518 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 519 assertEquals(bigramProbability, 520 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 521 } 522 } 523 524 @Test testRandomOperationsAndFlashWithGC()525 public void testRandomOperationsAndFlashWithGC() { 526 final int maxUnigramCount = 5000; 527 final int maxBigramCount = 10000; 528 final HashMap<String, String> attributeMap = new HashMap<>(); 529 attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); 530 attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); 531 532 final int flashWithGCIterationCount = 50; 533 final int operationCountInEachIteration = 200; 534 final int initialUnigramCount = 100; 535 final float addUnigramProb = 0.5f; 536 final float addBigramProb = 0.8f; 537 final int codePointSetSize = 30; 538 539 final long seed = System.currentTimeMillis(); 540 final Random random = new Random(seed); 541 final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, 542 attributeMap); 543 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 544 545 final ArrayList<String> words = new ArrayList<>(); 546 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 547 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 548 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 549 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 550 for (int i = 0; i < initialUnigramCount; ++i) { 551 final String word = CodePointUtils.generateWord(random, codePointSet); 552 words.add(word); 553 final int unigramProbability = random.nextInt(0xFF); 554 unigramProbabilities.put(word, unigramProbability); 555 addUnigramWord(binaryDictionary, word, unigramProbability); 556 } 557 binaryDictionary.flushWithGC(); 558 binaryDictionary.close(); 559 560 for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) { 561 binaryDictionary = getBinaryDictionary(dictFile); 562 for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) { 563 // Add unigram. 564 if (random.nextFloat() < addUnigramProb) { 565 final String word = CodePointUtils.generateWord(random, codePointSet); 566 words.add(word); 567 final int unigramProbability = random.nextInt(0xFF); 568 unigramProbabilities.put(word, unigramProbability); 569 addUnigramWord(binaryDictionary, word, unigramProbability); 570 } 571 // Add bigram. 572 if (random.nextFloat() < addBigramProb && words.size() > 2) { 573 final int word0Index = random.nextInt(words.size()); 574 int word1Index = random.nextInt(words.size() - 1); 575 if (word0Index <= word1Index) { 576 word1Index++; 577 } 578 final String word0 = words.get(word0Index); 579 final String word1 = words.get(word1Index); 580 if (TextUtils.equals(word0, word1)) { 581 continue; 582 } 583 final int unigramProbability = unigramProbabilities.get(word1); 584 final int bigramProbability = 585 unigramProbability + random.nextInt(0xFF - unigramProbability); 586 final Pair<String, String> bigram = new Pair<>(word0, word1); 587 bigramWords.add(bigram); 588 bigramProbabilities.put(bigram, bigramProbability); 589 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 590 } 591 } 592 593 // Test whether the all unigram operations are collectlly handled. 594 for (int i = 0; i < words.size(); i++) { 595 final String word = words.get(i); 596 final int unigramProbability = unigramProbabilities.get(word); 597 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); 598 } 599 // Test whether the all bigram operations are collectlly handled. 600 for (int i = 0; i < bigramWords.size(); i++) { 601 final Pair<String, String> bigram = bigramWords.get(i); 602 final int probability; 603 if (bigramProbabilities.containsKey(bigram)) { 604 probability = bigramProbabilities.get(bigram); 605 } else { 606 probability = Dictionary.NOT_A_PROBABILITY; 607 } 608 609 assertEquals(probability, 610 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 611 assertEquals(probability != Dictionary.NOT_A_PROBABILITY, 612 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 613 } 614 binaryDictionary.flushWithGC(); 615 binaryDictionary.close(); 616 } 617 } 618 619 @Test testAddManyUnigramsAndFlushWithGC()620 public void testAddManyUnigramsAndFlushWithGC() { 621 final int flashWithGCIterationCount = 3; 622 final int codePointSetSize = 50; 623 624 final long seed = System.currentTimeMillis(); 625 final Random random = new Random(seed); 626 627 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 628 629 final ArrayList<String> words = new ArrayList<>(); 630 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 631 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 632 633 BinaryDictionary binaryDictionary; 634 for (int i = 0; i < flashWithGCIterationCount; i++) { 635 binaryDictionary = getBinaryDictionary(dictFile); 636 while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { 637 final String word = CodePointUtils.generateWord(random, codePointSet); 638 words.add(word); 639 final int unigramProbability = random.nextInt(0xFF); 640 unigramProbabilities.put(word, unigramProbability); 641 addUnigramWord(binaryDictionary, word, unigramProbability); 642 } 643 644 for (int j = 0; j < words.size(); j++) { 645 final String word = words.get(j); 646 final int unigramProbability = unigramProbabilities.get(word); 647 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); 648 } 649 650 binaryDictionary.flushWithGC(); 651 binaryDictionary.close(); 652 } 653 } 654 655 @Test testUnigramAndBigramCount()656 public void testUnigramAndBigramCount() { 657 final int maxUnigramCount = 5000; 658 final int maxBigramCount = 10000; 659 final HashMap<String, String> attributeMap = new HashMap<>(); 660 attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); 661 attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); 662 663 final int flashWithGCIterationCount = 10; 664 final int codePointSetSize = 50; 665 final int unigramCountPerIteration = 1000; 666 final int bigramCountPerIteration = 2000; 667 final long seed = System.currentTimeMillis(); 668 final Random random = new Random(seed); 669 final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, 670 attributeMap); 671 672 final ArrayList<String> words = new ArrayList<>(); 673 final HashSet<Pair<String, String>> bigrams = new HashSet<>(); 674 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 675 676 BinaryDictionary binaryDictionary; 677 for (int i = 0; i < flashWithGCIterationCount; i++) { 678 binaryDictionary = getBinaryDictionary(dictFile); 679 for (int j = 0; j < unigramCountPerIteration; j++) { 680 final String word = CodePointUtils.generateWord(random, codePointSet); 681 words.add(word); 682 final int unigramProbability = random.nextInt(0xFF); 683 addUnigramWord(binaryDictionary, word, unigramProbability); 684 } 685 for (int j = 0; j < bigramCountPerIteration; j++) { 686 final String word0 = words.get(random.nextInt(words.size())); 687 final String word1 = words.get(random.nextInt(words.size())); 688 if (TextUtils.equals(word0, word1)) { 689 continue; 690 } 691 bigrams.add(new Pair<>(word0, word1)); 692 final int bigramProbability = random.nextInt(0xF); 693 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 694 } 695 assertEquals(new HashSet<>(words).size(), Integer.parseInt( 696 binaryDictionary.getPropertyForGettingStats( 697 BinaryDictionary.UNIGRAM_COUNT_QUERY))); 698 assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( 699 binaryDictionary.getPropertyForGettingStats( 700 BinaryDictionary.BIGRAM_COUNT_QUERY))); 701 binaryDictionary.flushWithGC(); 702 assertEquals(new HashSet<>(words).size(), Integer.parseInt( 703 binaryDictionary.getPropertyForGettingStats( 704 BinaryDictionary.UNIGRAM_COUNT_QUERY))); 705 assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( 706 binaryDictionary.getPropertyForGettingStats( 707 BinaryDictionary.BIGRAM_COUNT_QUERY))); 708 binaryDictionary.close(); 709 } 710 } 711 712 @Test testGetWordProperties()713 public void testGetWordProperties() { 714 final long seed = System.currentTimeMillis(); 715 final Random random = new Random(seed); 716 final int UNIGRAM_COUNT = 1000; 717 final int BIGRAM_COUNT = 1000; 718 final int codePointSetSize = 20; 719 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 720 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 721 final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 722 723 final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", 724 false /* isBeginningOfSentence */); 725 assertFalse(invalidWordProperty.isValid()); 726 727 final ArrayList<String> words = new ArrayList<>(); 728 final HashMap<String, Integer> wordProbabilities = new HashMap<>(); 729 final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); 730 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 731 732 for (int i = 0; i < UNIGRAM_COUNT; i++) { 733 final String word = CodePointUtils.generateWord(random, codePointSet); 734 final int unigramProbability = random.nextInt(0xFF); 735 final boolean isNotAWord = random.nextBoolean(); 736 final boolean isPossiblyOffensive = random.nextBoolean(); 737 // TODO: Add tests for historical info. 738 binaryDictionary.addUnigramEntry(word, unigramProbability, 739 false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive, 740 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 741 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 742 binaryDictionary.flushWithGC(); 743 } 744 words.add(word); 745 wordProbabilities.put(word, unigramProbability); 746 final WordProperty wordProperty = binaryDictionary.getWordProperty(word, 747 false /* isBeginningOfSentence */); 748 assertEquals(word, wordProperty.mWord); 749 assertTrue(wordProperty.isValid()); 750 assertEquals(isNotAWord, wordProperty.mIsNotAWord); 751 assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive); 752 assertEquals(false, wordProperty.mHasNgrams); 753 assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability); 754 } 755 756 for (int i = 0; i < BIGRAM_COUNT; i++) { 757 final int word0Index = random.nextInt(wordProbabilities.size()); 758 final int word1Index = random.nextInt(wordProbabilities.size()); 759 if (word0Index == word1Index) { 760 continue; 761 } 762 final String word0 = words.get(word0Index); 763 final String word1 = words.get(word1Index); 764 final int unigramProbability = wordProbabilities.get(word1); 765 final int bigramProbability = 766 unigramProbability + random.nextInt(0xFF - unigramProbability); 767 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 768 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 769 binaryDictionary.flushWithGC(); 770 } 771 if (!bigrams.containsKey(word0)) { 772 final HashSet<String> bigramWord1s = new HashSet<>(); 773 bigrams.put(word0, bigramWord1s); 774 } 775 bigrams.get(word0).add(word1); 776 bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability); 777 } 778 779 for (int i = 0; i < words.size(); i++) { 780 final String word0 = words.get(i); 781 if (!bigrams.containsKey(word0)) { 782 continue; 783 } 784 final HashSet<String> bigramWord1s = bigrams.get(word0); 785 final WordProperty wordProperty = binaryDictionary.getWordProperty(word0, 786 false /* isBeginningOfSentence */); 787 assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size()); 788 // TODO: Support ngram. 789 for (final WeightedString bigramTarget : wordProperty.getBigrams()) { 790 final String word1 = bigramTarget.mWord; 791 assertTrue(bigramWord1s.contains(word1)); 792 final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1)); 793 assertEquals(bigramProbability, bigramTarget.getProbability()); 794 } 795 } 796 } 797 798 @Test testIterateAllWords()799 public void testIterateAllWords() { 800 final long seed = System.currentTimeMillis(); 801 final Random random = new Random(seed); 802 final int UNIGRAM_COUNT = 1000; 803 final int BIGRAM_COUNT = 1000; 804 final int codePointSetSize = 20; 805 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 806 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 807 808 final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", 809 false /* isBeginningOfSentence */); 810 assertFalse(invalidWordProperty.isValid()); 811 812 final ArrayList<String> words = new ArrayList<>(); 813 final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>(); 814 final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); 815 final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater = 816 new HashMap<>(); 817 818 for (int i = 0; i < UNIGRAM_COUNT; i++) { 819 final String word = CodePointUtils.generateWord(random, codePointSet); 820 final int unigramProbability = random.nextInt(0xFF); 821 addUnigramWord(binaryDictionary, word, unigramProbability); 822 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 823 binaryDictionary.flushWithGC(); 824 } 825 words.add(word); 826 wordProbabilitiesToCheckLater.put(word, unigramProbability); 827 } 828 829 for (int i = 0; i < BIGRAM_COUNT; i++) { 830 final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size()); 831 final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size()); 832 if (word0Index == word1Index) { 833 continue; 834 } 835 final String word0 = words.get(word0Index); 836 final String word1 = words.get(word1Index); 837 final int unigramProbability = wordProbabilitiesToCheckLater.get(word1); 838 final int bigramProbability = 839 unigramProbability + random.nextInt(0xFF - unigramProbability); 840 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 841 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 842 binaryDictionary.flushWithGC(); 843 } 844 if (!bigrams.containsKey(word0)) { 845 final HashSet<String> bigramWord1s = new HashSet<>(); 846 bigrams.put(word0, bigramWord1s); 847 } 848 bigrams.get(word0).add(word1); 849 bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability); 850 } 851 852 final HashSet<String> wordSet = new HashSet<>(words); 853 final HashSet<Pair<String, String>> bigramSet = 854 new HashSet<>(bigramProbabilitiesToCheckLater.keySet()); 855 int token = 0; 856 do { 857 final BinaryDictionary.GetNextWordPropertyResult result = 858 binaryDictionary.getNextWordProperty(token); 859 final WordProperty wordProperty = result.mWordProperty; 860 final String word0 = wordProperty.mWord; 861 assertEquals((int)wordProbabilitiesToCheckLater.get(word0), 862 wordProperty.mProbabilityInfo.mProbability); 863 wordSet.remove(word0); 864 final HashSet<String> bigramWord1s = bigrams.get(word0); 865 // TODO: Support ngram. 866 if (wordProperty.mHasNgrams) { 867 for (final WeightedString bigramTarget : wordProperty.getBigrams()) { 868 final String word1 = bigramTarget.mWord; 869 assertTrue(bigramWord1s.contains(word1)); 870 final Pair<String, String> bigram = new Pair<>(word0, word1); 871 final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram); 872 assertEquals(bigramProbability, bigramTarget.getProbability()); 873 bigramSet.remove(bigram); 874 } 875 } 876 token = result.mNextToken; 877 } while (token != 0); 878 assertTrue(wordSet.isEmpty()); 879 assertTrue(bigramSet.isEmpty()); 880 } 881 882 @Test testPossiblyOffensiveAttributeMaintained()883 public void testPossiblyOffensiveAttributeMaintained() { 884 final BinaryDictionary binaryDictionary = 885 getEmptyBinaryDictionary(FormatSpec.VERSION403); 886 binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0); 887 WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false); 888 assertEquals(true, wordProperty.mIsPossiblyOffensive); 889 } 890 891 @Test testBeginningOfSentence()892 public void testBeginningOfSentence() { 893 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 894 final int dummyProbability = 0; 895 final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE; 896 final int bigramProbability = 200; 897 addUnigramWord(binaryDictionary, "aaa", dummyProbability); 898 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, 899 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 900 assertEquals(bigramProbability, 901 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); 902 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, 903 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 904 addUnigramWord(binaryDictionary, "bbb", dummyProbability); 905 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability, 906 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 907 binaryDictionary.flushWithGC(); 908 assertEquals(bigramProbability, 909 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); 910 assertEquals(bigramProbability, 911 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb")); 912 } 913 } 914