1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin;
18 
19 import static org.junit.Assert.assertEquals;
20 import static org.junit.Assert.assertFalse;
21 import static org.junit.Assert.assertTrue;
22 import static org.junit.Assert.fail;
23 
24 import android.text.TextUtils;
25 import android.util.Pair;
26 
27 import androidx.test.InstrumentationRegistry;
28 import androidx.test.filters.LargeTest;
29 import androidx.test.runner.AndroidJUnit4;
30 
31 import com.android.inputmethod.latin.NgramContext.WordInfo;
32 import com.android.inputmethod.latin.common.CodePointUtils;
33 import com.android.inputmethod.latin.common.FileUtils;
34 import com.android.inputmethod.latin.makedict.DictionaryHeader;
35 import com.android.inputmethod.latin.makedict.FormatSpec;
36 import com.android.inputmethod.latin.makedict.WeightedString;
37 import com.android.inputmethod.latin.makedict.WordProperty;
38 import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
39 
40 import org.junit.After;
41 import org.junit.Before;
42 import org.junit.Test;
43 import org.junit.runner.RunWith;
44 
45 import java.io.File;
46 import java.io.IOException;
47 import java.util.ArrayList;
48 import java.util.HashMap;
49 import java.util.HashSet;
50 import java.util.Locale;
51 import java.util.Random;
52 
53 @LargeTest
54 @RunWith(AndroidJUnit4.class)
55 public class BinaryDictionaryTests {
    // Suffix of the temporary dictionary files created by these tests.
    private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
    // String handed to the BinaryDictionary constructors right after the Locale argument.
    private static final String TEST_LOCALE = "test";
    // Prefix of the temporary dictionary files created by these tests.
    private static final String DICTIONARY_ID = "TestBinaryDictionary";

    // Dictionary files created while a test runs; tearDown() deletes them and empties the set.
    private HashSet<File> mDictFilesToBeDeleted = new HashSet<>();
61 
62     @Before
setUp()63     public void setUp() throws Exception {
64         mDictFilesToBeDeleted.clear();
65     }
66 
67     @After
tearDown()68     public void tearDown() throws Exception {
69         for (final File dictFile : mDictFilesToBeDeleted) {
70             dictFile.delete();
71         }
72         mDictFilesToBeDeleted.clear();
73     }
74 
createEmptyDictionaryAndGetFile(final int formatVersion)75     private File createEmptyDictionaryAndGetFile(final int formatVersion) {
76         return createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
77                 new HashMap<String, String>());
78     }
79 
createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)80     private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion,
81             final HashMap<String, String> attributeMap) {
82         try {
83             final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion,
84                     attributeMap);
85             mDictFilesToBeDeleted.add(dictFile);
86             return dictFile;
87         } catch (final IOException e) {
88             fail(e.toString());
89         }
90         return null;
91     }
92 
createEmptyVer4DictionaryAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)93     private File createEmptyVer4DictionaryAndGetFile(final int formatVersion,
94             final HashMap<String, String> attributeMap) throws IOException {
95         final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION,
96                 InstrumentationRegistry.getTargetContext().getCacheDir());
97         file.delete();
98         file.mkdir();
99         if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion,
100                 Locale.ENGLISH, attributeMap)) {
101             return file;
102         }
103         throw new IOException("Empty dictionary " + file.getAbsolutePath()
104                 + " cannot be created. Format version: " + formatVersion);
105     }
106 
getBinaryDictionary(final File dictFile)107     private static BinaryDictionary getBinaryDictionary(final File dictFile) {
108         return new BinaryDictionary(dictFile.getAbsolutePath(),
109                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
110                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
111     }
112 
getEmptyBinaryDictionary(final int formatVersion)113     private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) {
114         final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
115         return new BinaryDictionary(dictFile.getAbsolutePath(),
116                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
117                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
118     }
119 
120     @Test
testIsValidDictionary()121     public void testIsValidDictionary() {
122         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
123         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
124         assertTrue("binaryDictionary must be valid for existing valid dictionary file.",
125                 binaryDictionary.isValidDictionary());
126         binaryDictionary.close();
127         assertFalse("binaryDictionary must be invalid after closing.",
128                 binaryDictionary.isValidDictionary());
129         FileUtils.deleteRecursively(dictFile);
130         binaryDictionary = getBinaryDictionary(dictFile);
131         assertFalse("binaryDictionary must be invalid for not existing dictionary file.",
132                 binaryDictionary.isValidDictionary());
133         binaryDictionary.close();
134     }
135 
136     @Test
testConstructingDictionaryOnMemory()137     public void testConstructingDictionaryOnMemory() {
138         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
139         FileUtils.deleteRecursively(dictFile);
140         assertFalse(dictFile.exists());
141         final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
142                 true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
143                 FormatSpec.VERSION403, new HashMap<String, String>());
144         assertTrue(binaryDictionary.isValidDictionary());
145         assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
146         final int probability = 100;
147         addUnigramWord(binaryDictionary, "word", probability);
148         assertEquals(probability, binaryDictionary.getFrequency("word"));
149         assertFalse(dictFile.exists());
150         binaryDictionary.flush();
151         assertTrue(dictFile.exists());
152         assertTrue(binaryDictionary.isValidDictionary());
153         assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
154         assertEquals(probability, binaryDictionary.getFrequency("word"));
155         binaryDictionary.close();
156     }
157 
158     @Test
testAddTooLongWord()159     public void testAddTooLongWord() {
160         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
161         final StringBuffer stringBuilder = new StringBuffer();
162         for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) {
163             stringBuilder.append('a');
164         }
165         final String validLongWord = stringBuilder.toString();
166         stringBuilder.append('a');
167         final String invalidLongWord = stringBuilder.toString();
168         final int probability = 100;
169         addUnigramWord(binaryDictionary, "aaa", probability);
170         addUnigramWord(binaryDictionary, validLongWord, probability);
171         addUnigramWord(binaryDictionary, invalidLongWord, probability);
172         // Too long short cut.
173         binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */,
174                 false /* isNotAWord */, false /* isPossiblyOffensive */,
175                 BinaryDictionary.NOT_A_VALID_TIMESTAMP);
176         addUnigramWord(binaryDictionary, "abc", probability);
177         final int updatedProbability = 200;
178         // Update.
179         addUnigramWord(binaryDictionary, validLongWord, updatedProbability);
180         addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability);
181         addUnigramWord(binaryDictionary, "abc", updatedProbability);
182 
183         assertEquals(probability, binaryDictionary.getFrequency("aaa"));
184         assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord));
185         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord));
186         assertEquals(updatedProbability, binaryDictionary.getFrequency("abc"));
187     }
188 
addUnigramWord(final BinaryDictionary binaryDictionary, final String word, final int probability)189     private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word,
190             final int probability) {
191         binaryDictionary.addUnigramEntry(word, probability,
192                 false /* isBeginningOfSentence */, false /* isNotAWord */,
193                 false /* isPossiblyOffensive */,
194                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
195     }
196 
addBigramWords(final BinaryDictionary binaryDictionary, final String word0, final String word1, final int probability)197     private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
198             final String word1, final int probability) {
199         binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability,
200                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
201     }
202 
addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2, final int probability)203     private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0,
204             final String word1, final String word2, final int probability) {
205         binaryDictionary.addNgramEntry(
206                 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2,
207                 probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
208     }
209 
isValidBigram(final BinaryDictionary binaryDictionary, final String word0, final String word1)210     private static boolean isValidBigram(final BinaryDictionary binaryDictionary,
211             final String word0, final String word1) {
212         return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1);
213     }
214 
getBigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1)215     private static int getBigramProbability(final BinaryDictionary binaryDictionary,
216             final String word0,  final String word1) {
217         return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1);
218     }
219 
getTrigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2)220     private static int getTrigramProbability(final BinaryDictionary binaryDictionary,
221             final String word0, final String word1, final String word2) {
222         return binaryDictionary.getNgramProbability(
223                 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2);
224     }
225 
226     @Test
testAddUnigramWord()227     public void testAddUnigramWord() {
228         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
229         final int probability = 100;
230         addUnigramWord(binaryDictionary, "aaa", probability);
231         // Reallocate and create.
232         addUnigramWord(binaryDictionary, "aab", probability);
233         // Insert into children.
234         addUnigramWord(binaryDictionary, "aac", probability);
235         // Make terminal.
236         addUnigramWord(binaryDictionary, "aa", probability);
237         // Create children.
238         addUnigramWord(binaryDictionary, "aaaa", probability);
239         // Reallocate and make termianl.
240         addUnigramWord(binaryDictionary, "a", probability);
241 
242         final int updatedProbability = 200;
243         // Update.
244         addUnigramWord(binaryDictionary, "aaa", updatedProbability);
245 
246         assertEquals(probability, binaryDictionary.getFrequency("aab"));
247         assertEquals(probability, binaryDictionary.getFrequency("aac"));
248         assertEquals(probability, binaryDictionary.getFrequency("aa"));
249         assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
250         assertEquals(probability, binaryDictionary.getFrequency("a"));
251         assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
252     }
253 
254     @Test
testRandomlyAddUnigramWord()255     public void testRandomlyAddUnigramWord() {
256         final int wordCount = 1000;
257         final int codePointSetSize = 50;
258         final long seed = System.currentTimeMillis();
259         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
260 
261         final HashMap<String, Integer> probabilityMap = new HashMap<>();
262         // Test a word that isn't contained within the dictionary.
263         final Random random = new Random(seed);
264         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
265         for (int i = 0; i < wordCount; ++i) {
266             final String word = CodePointUtils.generateWord(random, codePointSet);
267             probabilityMap.put(word, random.nextInt(0xFF));
268         }
269         for (String word : probabilityMap.keySet()) {
270             addUnigramWord(binaryDictionary, word, probabilityMap.get(word));
271         }
272         for (String word : probabilityMap.keySet()) {
273             assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word));
274         }
275     }
276 
277     @Test
testAddBigramWords()278     public void testAddBigramWords() {
279         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
280 
281         final int unigramProbability = 100;
282         final int bigramProbability = 150;
283         final int updatedBigramProbability = 200;
284         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
285         addUnigramWord(binaryDictionary, "abb", unigramProbability);
286         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
287         addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
288         addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
289         addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
290         addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
291 
292         assertTrue(isValidBigram(binaryDictionary, "aaa", "abb"));
293         assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc"));
294         assertTrue(isValidBigram(binaryDictionary, "abb", "aaa"));
295         assertTrue(isValidBigram(binaryDictionary, "abb", "bcc"));
296         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
297         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
298         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
299         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
300 
301         addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability);
302         assertEquals(updatedBigramProbability,
303                 getBigramProbability(binaryDictionary, "aaa", "abb"));
304 
305         assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
306         assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
307         assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
308         assertEquals(Dictionary.NOT_A_PROBABILITY,
309                 getBigramProbability(binaryDictionary, "bcc", "aaa"));
310         assertEquals(Dictionary.NOT_A_PROBABILITY,
311                 getBigramProbability(binaryDictionary, "bcc", "bbc"));
312         assertEquals(Dictionary.NOT_A_PROBABILITY,
313                 getBigramProbability(binaryDictionary, "aaa", "aaa"));
314 
315         // Testing bigram link.
316         addUnigramWord(binaryDictionary, "abcde", unigramProbability);
317         addUnigramWord(binaryDictionary, "fghij", unigramProbability);
318         addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability);
319         addUnigramWord(binaryDictionary, "fgh", unigramProbability);
320         addUnigramWord(binaryDictionary, "abc", unigramProbability);
321         addUnigramWord(binaryDictionary, "f", unigramProbability);
322 
323         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij"));
324         assertEquals(Dictionary.NOT_A_PROBABILITY,
325                 getBigramProbability(binaryDictionary, "abcde", "fgh"));
326         addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability);
327         assertEquals(updatedBigramProbability,
328                 getBigramProbability(binaryDictionary, "abcde", "fghij"));
329     }
330 
331     @Test
testRandomlyAddBigramWords()332     public void testRandomlyAddBigramWords() {
333         final int wordCount = 100;
334         final int bigramCount = 1000;
335         final int codePointSetSize = 50;
336         final long seed = System.currentTimeMillis();
337         final Random random = new Random(seed);
338         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
339 
340         final ArrayList<String> words = new ArrayList<>();
341         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
342         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
343         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
344         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
345 
346         for (int i = 0; i < wordCount; ++i) {
347             final String word = CodePointUtils.generateWord(random, codePointSet);
348             words.add(word);
349             final int unigramProbability = random.nextInt(0xFF);
350             unigramProbabilities.put(word, unigramProbability);
351             addUnigramWord(binaryDictionary, word, unigramProbability);
352         }
353 
354         for (int i = 0; i < bigramCount; i++) {
355             final String word0 = words.get(random.nextInt(wordCount));
356             final String word1 = words.get(random.nextInt(wordCount));
357             if (TextUtils.equals(word0, word1)) {
358                 continue;
359             }
360             final Pair<String, String> bigram = new Pair<>(word0, word1);
361             bigramWords.add(bigram);
362             final int unigramProbability = unigramProbabilities.get(word1);
363             final int bigramProbability =
364                     unigramProbability + random.nextInt(0xFF - unigramProbability);
365             bigramProbabilities.put(bigram, bigramProbability);
366             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
367         }
368 
369         for (final Pair<String, String> bigram : bigramWords) {
370             final int bigramProbability = bigramProbabilities.get(bigram);
371             assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
372                     isValidBigram(binaryDictionary, bigram.first, bigram.second));
373             assertEquals(bigramProbability,
374                     getBigramProbability(binaryDictionary, bigram.first, bigram.second));
375         }
376     }
377 
378     @Test
testAddTrigramWords()379     public void testAddTrigramWords() {
380         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
381         final int unigramProbability = 100;
382         final int trigramProbability = 150;
383         final int updatedTrigramProbability = 200;
384         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
385         addUnigramWord(binaryDictionary, "abb", unigramProbability);
386         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
387 
388         addBigramWords(binaryDictionary, "abb", "bcc", 10);
389         addBigramWords(binaryDictionary, "abb", "aaa", 10);
390 
391         addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability);
392         addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability);
393 
394         assertEquals(trigramProbability,
395                 getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc"));
396         assertEquals(trigramProbability,
397                 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
398         assertFalse(isValidBigram(binaryDictionary, "aaa", "abb"));
399 
400         addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability);
401         assertEquals(updatedTrigramProbability,
402                 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
403     }
404 
405     @Test
testFlushDictionary()406     public void testFlushDictionary() {
407         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
408         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
409 
410         final int probability = 100;
411         addUnigramWord(binaryDictionary, "aaa", probability);
412         addUnigramWord(binaryDictionary, "abcd", probability);
413         // Close without flushing.
414         binaryDictionary.close();
415 
416         binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
417                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
418                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
419 
420         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa"));
421         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd"));
422 
423         addUnigramWord(binaryDictionary, "aaa", probability);
424         addUnigramWord(binaryDictionary, "abcd", probability);
425         binaryDictionary.flush();
426         binaryDictionary.close();
427 
428         binaryDictionary = getBinaryDictionary(dictFile);
429         assertEquals(probability, binaryDictionary.getFrequency("aaa"));
430         assertEquals(probability, binaryDictionary.getFrequency("abcd"));
431         addUnigramWord(binaryDictionary, "bcde", probability);
432         binaryDictionary.flush();
433         binaryDictionary.close();
434 
435         binaryDictionary = getBinaryDictionary(dictFile);
436         assertEquals(probability, binaryDictionary.getFrequency("bcde"));
437         binaryDictionary.close();
438     }
439 
440     @Test
testFlushWithGCDictionary()441     public void testFlushWithGCDictionary() {
442         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
443         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
444         final int unigramProbability = 100;
445         final int bigramProbability = 150;
446         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
447         addUnigramWord(binaryDictionary, "abb", unigramProbability);
448         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
449         addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
450         addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
451         addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
452         addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
453         binaryDictionary.flushWithGC();
454         binaryDictionary.close();
455 
456         binaryDictionary = getBinaryDictionary(dictFile);
457         assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
458         assertEquals(unigramProbability, binaryDictionary.getFrequency("abb"));
459         assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc"));
460         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
461         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
462         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
463         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
464         assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
465         assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
466         assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
467         binaryDictionary.flushWithGC();
468         binaryDictionary.close();
469     }
470 
471     @Test
testAddBigramWordsAndFlashWithGC()472     public void testAddBigramWordsAndFlashWithGC() {
473         final int wordCount = 100;
474         final int bigramCount = 1000;
475         final int codePointSetSize = 30;
476         final long seed = System.currentTimeMillis();
477         final Random random = new Random(seed);
478 
479         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
480         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
481 
482         final ArrayList<String> words = new ArrayList<>();
483         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
484         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
485         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
486         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
487 
488         for (int i = 0; i < wordCount; ++i) {
489             final String word = CodePointUtils.generateWord(random, codePointSet);
490             words.add(word);
491             final int unigramProbability = random.nextInt(0xFF);
492             unigramProbabilities.put(word, unigramProbability);
493             addUnigramWord(binaryDictionary, word, unigramProbability);
494         }
495 
496         for (int i = 0; i < bigramCount; i++) {
497             final String word0 = words.get(random.nextInt(wordCount));
498             final String word1 = words.get(random.nextInt(wordCount));
499             if (TextUtils.equals(word0, word1)) {
500                 continue;
501             }
502             final Pair<String, String> bigram = new Pair<>(word0, word1);
503             bigramWords.add(bigram);
504             final int unigramProbability = unigramProbabilities.get(word1);
505             final int bigramProbability =
506                     unigramProbability + random.nextInt(0xFF - unigramProbability);
507             bigramProbabilities.put(bigram, bigramProbability);
508             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
509         }
510 
511         binaryDictionary.flushWithGC();
512         binaryDictionary.close();
513         binaryDictionary = getBinaryDictionary(dictFile);
514 
515         for (final Pair<String, String> bigram : bigramWords) {
516             final int bigramProbability = bigramProbabilities.get(bigram);
517             assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
518                     isValidBigram(binaryDictionary, bigram.first, bigram.second));
519             assertEquals(bigramProbability,
520                     getBigramProbability(binaryDictionary, bigram.first, bigram.second));
521         }
522     }
523 
524     @Test
testRandomOperationsAndFlashWithGC()525     public void testRandomOperationsAndFlashWithGC() {
526         final int maxUnigramCount = 5000;
527         final int maxBigramCount = 10000;
528         final HashMap<String, String> attributeMap = new HashMap<>();
529         attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
530         attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
531 
532         final int flashWithGCIterationCount = 50;
533         final int operationCountInEachIteration = 200;
534         final int initialUnigramCount = 100;
535         final float addUnigramProb = 0.5f;
536         final float addBigramProb = 0.8f;
537         final int codePointSetSize = 30;
538 
539         final long seed = System.currentTimeMillis();
540         final Random random = new Random(seed);
541         final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
542                 attributeMap);
543         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
544 
545         final ArrayList<String> words = new ArrayList<>();
546         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
547         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
548         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
549         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
550         for (int i = 0; i < initialUnigramCount; ++i) {
551             final String word = CodePointUtils.generateWord(random, codePointSet);
552             words.add(word);
553             final int unigramProbability = random.nextInt(0xFF);
554             unigramProbabilities.put(word, unigramProbability);
555             addUnigramWord(binaryDictionary, word, unigramProbability);
556         }
557         binaryDictionary.flushWithGC();
558         binaryDictionary.close();
559 
560         for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) {
561             binaryDictionary = getBinaryDictionary(dictFile);
562             for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) {
563                 // Add unigram.
564                 if (random.nextFloat() < addUnigramProb) {
565                     final String word = CodePointUtils.generateWord(random, codePointSet);
566                     words.add(word);
567                     final int unigramProbability = random.nextInt(0xFF);
568                     unigramProbabilities.put(word, unigramProbability);
569                     addUnigramWord(binaryDictionary, word, unigramProbability);
570                 }
571                 // Add bigram.
572                 if (random.nextFloat() < addBigramProb && words.size() > 2) {
573                     final int word0Index = random.nextInt(words.size());
574                     int word1Index = random.nextInt(words.size() - 1);
575                     if (word0Index <= word1Index) {
576                         word1Index++;
577                     }
578                     final String word0 = words.get(word0Index);
579                     final String word1 = words.get(word1Index);
580                     if (TextUtils.equals(word0, word1)) {
581                         continue;
582                     }
583                     final int unigramProbability = unigramProbabilities.get(word1);
584                     final int bigramProbability =
585                             unigramProbability + random.nextInt(0xFF - unigramProbability);
586                     final Pair<String, String> bigram = new Pair<>(word0, word1);
587                     bigramWords.add(bigram);
588                     bigramProbabilities.put(bigram, bigramProbability);
589                     addBigramWords(binaryDictionary, word0, word1, bigramProbability);
590                 }
591             }
592 
            // Test whether all the unigram operations are correctly handled.
594             for (int i = 0; i < words.size(); i++) {
595                 final String word = words.get(i);
596                 final int unigramProbability = unigramProbabilities.get(word);
597                 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
598             }
            // Test whether all the bigram operations are correctly handled.
600             for (int i = 0; i < bigramWords.size(); i++) {
601                 final Pair<String, String> bigram = bigramWords.get(i);
602                 final int probability;
603                 if (bigramProbabilities.containsKey(bigram)) {
604                     probability = bigramProbabilities.get(bigram);
605                 } else {
606                     probability = Dictionary.NOT_A_PROBABILITY;
607                 }
608 
609                 assertEquals(probability,
610                         getBigramProbability(binaryDictionary, bigram.first, bigram.second));
611                 assertEquals(probability != Dictionary.NOT_A_PROBABILITY,
612                         isValidBigram(binaryDictionary, bigram.first, bigram.second));
613             }
614             binaryDictionary.flushWithGC();
615             binaryDictionary.close();
616         }
617     }
618 
619     @Test
testAddManyUnigramsAndFlushWithGC()620     public void testAddManyUnigramsAndFlushWithGC() {
621         final int flashWithGCIterationCount = 3;
622         final int codePointSetSize = 50;
623 
624         final long seed = System.currentTimeMillis();
625         final Random random = new Random(seed);
626 
627         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
628 
629         final ArrayList<String> words = new ArrayList<>();
630         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
631         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
632 
633         BinaryDictionary binaryDictionary;
634         for (int i = 0; i < flashWithGCIterationCount; i++) {
635             binaryDictionary = getBinaryDictionary(dictFile);
636             while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
637                 final String word = CodePointUtils.generateWord(random, codePointSet);
638                 words.add(word);
639                 final int unigramProbability = random.nextInt(0xFF);
640                 unigramProbabilities.put(word, unigramProbability);
641                 addUnigramWord(binaryDictionary, word, unigramProbability);
642             }
643 
644             for (int j = 0; j < words.size(); j++) {
645                 final String word = words.get(j);
646                 final int unigramProbability = unigramProbabilities.get(word);
647                 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
648             }
649 
650             binaryDictionary.flushWithGC();
651             binaryDictionary.close();
652         }
653     }
654 
655     @Test
testUnigramAndBigramCount()656     public void testUnigramAndBigramCount() {
657         final int maxUnigramCount = 5000;
658         final int maxBigramCount = 10000;
659         final HashMap<String, String> attributeMap = new HashMap<>();
660         attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
661         attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
662 
663         final int flashWithGCIterationCount = 10;
664         final int codePointSetSize = 50;
665         final int unigramCountPerIteration = 1000;
666         final int bigramCountPerIteration = 2000;
667         final long seed = System.currentTimeMillis();
668         final Random random = new Random(seed);
669         final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
670                 attributeMap);
671 
672         final ArrayList<String> words = new ArrayList<>();
673         final HashSet<Pair<String, String>> bigrams = new HashSet<>();
674         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
675 
676         BinaryDictionary binaryDictionary;
677         for (int i = 0; i < flashWithGCIterationCount; i++) {
678             binaryDictionary = getBinaryDictionary(dictFile);
679             for (int j = 0; j < unigramCountPerIteration; j++) {
680                 final String word = CodePointUtils.generateWord(random, codePointSet);
681                 words.add(word);
682                 final int unigramProbability = random.nextInt(0xFF);
683                 addUnigramWord(binaryDictionary, word, unigramProbability);
684             }
685             for (int j = 0; j < bigramCountPerIteration; j++) {
686                 final String word0 = words.get(random.nextInt(words.size()));
687                 final String word1 = words.get(random.nextInt(words.size()));
688                 if (TextUtils.equals(word0, word1)) {
689                     continue;
690                 }
691                 bigrams.add(new Pair<>(word0, word1));
692                 final int bigramProbability = random.nextInt(0xF);
693                 addBigramWords(binaryDictionary, word0, word1, bigramProbability);
694             }
695             assertEquals(new HashSet<>(words).size(), Integer.parseInt(
696                     binaryDictionary.getPropertyForGettingStats(
697                             BinaryDictionary.UNIGRAM_COUNT_QUERY)));
698             assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
699                     binaryDictionary.getPropertyForGettingStats(
700                             BinaryDictionary.BIGRAM_COUNT_QUERY)));
701             binaryDictionary.flushWithGC();
702             assertEquals(new HashSet<>(words).size(), Integer.parseInt(
703                     binaryDictionary.getPropertyForGettingStats(
704                             BinaryDictionary.UNIGRAM_COUNT_QUERY)));
705             assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
706                     binaryDictionary.getPropertyForGettingStats(
707                             BinaryDictionary.BIGRAM_COUNT_QUERY)));
708             binaryDictionary.close();
709         }
710     }
711 
712     @Test
testGetWordProperties()713     public void testGetWordProperties() {
714         final long seed = System.currentTimeMillis();
715         final Random random = new Random(seed);
716         final int UNIGRAM_COUNT = 1000;
717         final int BIGRAM_COUNT = 1000;
718         final int codePointSetSize = 20;
719         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
720         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
721         final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
722 
723         final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
724                 false /* isBeginningOfSentence */);
725         assertFalse(invalidWordProperty.isValid());
726 
727         final ArrayList<String> words = new ArrayList<>();
728         final HashMap<String, Integer> wordProbabilities = new HashMap<>();
729         final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
730         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
731 
732         for (int i = 0; i < UNIGRAM_COUNT; i++) {
733             final String word = CodePointUtils.generateWord(random, codePointSet);
734             final int unigramProbability = random.nextInt(0xFF);
735             final boolean isNotAWord = random.nextBoolean();
736             final boolean isPossiblyOffensive = random.nextBoolean();
737             // TODO: Add tests for historical info.
738             binaryDictionary.addUnigramEntry(word, unigramProbability,
739                     false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive,
740                     BinaryDictionary.NOT_A_VALID_TIMESTAMP);
741             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
742                 binaryDictionary.flushWithGC();
743             }
744             words.add(word);
745             wordProbabilities.put(word, unigramProbability);
746             final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
747                     false /* isBeginningOfSentence */);
748             assertEquals(word, wordProperty.mWord);
749             assertTrue(wordProperty.isValid());
750             assertEquals(isNotAWord, wordProperty.mIsNotAWord);
751             assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive);
752             assertEquals(false, wordProperty.mHasNgrams);
753             assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
754         }
755 
756         for (int i = 0; i < BIGRAM_COUNT; i++) {
757             final int word0Index = random.nextInt(wordProbabilities.size());
758             final int word1Index = random.nextInt(wordProbabilities.size());
759             if (word0Index == word1Index) {
760                 continue;
761             }
762             final String word0 = words.get(word0Index);
763             final String word1 = words.get(word1Index);
764             final int unigramProbability = wordProbabilities.get(word1);
765             final int bigramProbability =
766                     unigramProbability + random.nextInt(0xFF - unigramProbability);
767             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
768             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
769                 binaryDictionary.flushWithGC();
770             }
771             if (!bigrams.containsKey(word0)) {
772                 final HashSet<String> bigramWord1s = new HashSet<>();
773                 bigrams.put(word0, bigramWord1s);
774             }
775             bigrams.get(word0).add(word1);
776             bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability);
777         }
778 
779         for (int i = 0; i < words.size(); i++) {
780             final String word0 = words.get(i);
781             if (!bigrams.containsKey(word0)) {
782                 continue;
783             }
784             final HashSet<String> bigramWord1s = bigrams.get(word0);
785             final WordProperty wordProperty = binaryDictionary.getWordProperty(word0,
786                     false /* isBeginningOfSentence */);
787             assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size());
788             // TODO: Support ngram.
789             for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
790                 final String word1 = bigramTarget.mWord;
791                 assertTrue(bigramWord1s.contains(word1));
792                 final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1));
793                 assertEquals(bigramProbability, bigramTarget.getProbability());
794             }
795         }
796     }
797 
798     @Test
testIterateAllWords()799     public void testIterateAllWords() {
800         final long seed = System.currentTimeMillis();
801         final Random random = new Random(seed);
802         final int UNIGRAM_COUNT = 1000;
803         final int BIGRAM_COUNT = 1000;
804         final int codePointSetSize = 20;
805         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
806         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
807 
808         final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
809                 false /* isBeginningOfSentence */);
810         assertFalse(invalidWordProperty.isValid());
811 
812         final ArrayList<String> words = new ArrayList<>();
813         final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>();
814         final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
815         final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
816                 new HashMap<>();
817 
818         for (int i = 0; i < UNIGRAM_COUNT; i++) {
819             final String word = CodePointUtils.generateWord(random, codePointSet);
820             final int unigramProbability = random.nextInt(0xFF);
821             addUnigramWord(binaryDictionary, word, unigramProbability);
822             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
823                 binaryDictionary.flushWithGC();
824             }
825             words.add(word);
826             wordProbabilitiesToCheckLater.put(word, unigramProbability);
827         }
828 
829         for (int i = 0; i < BIGRAM_COUNT; i++) {
830             final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
831             final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
832             if (word0Index == word1Index) {
833                 continue;
834             }
835             final String word0 = words.get(word0Index);
836             final String word1 = words.get(word1Index);
837             final int unigramProbability = wordProbabilitiesToCheckLater.get(word1);
838             final int bigramProbability =
839                     unigramProbability + random.nextInt(0xFF - unigramProbability);
840             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
841             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
842                 binaryDictionary.flushWithGC();
843             }
844             if (!bigrams.containsKey(word0)) {
845                 final HashSet<String> bigramWord1s = new HashSet<>();
846                 bigrams.put(word0, bigramWord1s);
847             }
848             bigrams.get(word0).add(word1);
849             bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability);
850         }
851 
852         final HashSet<String> wordSet = new HashSet<>(words);
853         final HashSet<Pair<String, String>> bigramSet =
854                 new HashSet<>(bigramProbabilitiesToCheckLater.keySet());
855         int token = 0;
856         do {
857             final BinaryDictionary.GetNextWordPropertyResult result =
858                     binaryDictionary.getNextWordProperty(token);
859             final WordProperty wordProperty = result.mWordProperty;
860             final String word0 = wordProperty.mWord;
861             assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
862                     wordProperty.mProbabilityInfo.mProbability);
863             wordSet.remove(word0);
864             final HashSet<String> bigramWord1s = bigrams.get(word0);
865             // TODO: Support ngram.
866             if (wordProperty.mHasNgrams) {
867                 for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
868                     final String word1 = bigramTarget.mWord;
869                     assertTrue(bigramWord1s.contains(word1));
870                     final Pair<String, String> bigram = new Pair<>(word0, word1);
871                     final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram);
872                     assertEquals(bigramProbability, bigramTarget.getProbability());
873                     bigramSet.remove(bigram);
874                 }
875             }
876             token = result.mNextToken;
877         } while (token != 0);
878         assertTrue(wordSet.isEmpty());
879         assertTrue(bigramSet.isEmpty());
880     }
881 
882     @Test
testPossiblyOffensiveAttributeMaintained()883     public void testPossiblyOffensiveAttributeMaintained() {
884         final BinaryDictionary binaryDictionary =
885                 getEmptyBinaryDictionary(FormatSpec.VERSION403);
886         binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0);
887         WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false);
888         assertEquals(true, wordProperty.mIsPossiblyOffensive);
889     }
890 
891     @Test
testBeginningOfSentence()892     public void testBeginningOfSentence() {
893         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
894         final int dummyProbability = 0;
895         final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
896         final int bigramProbability = 200;
897         addUnigramWord(binaryDictionary, "aaa", dummyProbability);
898         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
899                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
900         assertEquals(bigramProbability,
901                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
902         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
903                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
904         addUnigramWord(binaryDictionary, "bbb", dummyProbability);
905         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability,
906                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
907         binaryDictionary.flushWithGC();
908         assertEquals(bigramProbability,
909                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
910         assertEquals(bigramProbability,
911                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb"));
912     }
913 }
914