1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin;
18 
19 import android.test.AndroidTestCase;
20 import android.test.suitebuilder.annotation.LargeTest;
21 import android.text.TextUtils;
22 import android.util.Pair;
23 
24 import com.android.inputmethod.latin.NgramContext.WordInfo;
25 import com.android.inputmethod.latin.common.CodePointUtils;
26 import com.android.inputmethod.latin.common.FileUtils;
27 import com.android.inputmethod.latin.makedict.DictionaryHeader;
28 import com.android.inputmethod.latin.makedict.FormatSpec;
29 import com.android.inputmethod.latin.makedict.WeightedString;
30 import com.android.inputmethod.latin.makedict.WordProperty;
31 import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
32 
33 import java.io.File;
34 import java.io.IOException;
35 import java.util.ArrayList;
36 import java.util.HashMap;
37 import java.util.HashSet;
38 import java.util.Locale;
39 import java.util.Random;
40 
41 @LargeTest
42 public class BinaryDictionaryTests extends AndroidTestCase {
43     private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
44     private static final String TEST_LOCALE = "test";
45     private static final String DICTIONARY_ID = "TestBinaryDictionary";
46 
47     private HashSet<File> mDictFilesToBeDeleted = new HashSet<>();
48 
    /**
     * Runs the base-class set-up and then empties the set of dictionary files that are
     * scheduled for deletion.
     *
     * @throws Exception if the base-class set-up fails
     */
    @Override
    protected void setUp() throws Exception {
        super.setUp();
        mDictFilesToBeDeleted.clear();
    }
54 
55     @Override
tearDown()56     protected void tearDown() throws Exception {
57         for (final File dictFile : mDictFilesToBeDeleted) {
58             dictFile.delete();
59         }
60         mDictFilesToBeDeleted.clear();
61         super.tearDown();
62     }
63 
createEmptyDictionaryAndGetFile(final int formatVersion)64     private File createEmptyDictionaryAndGetFile(final int formatVersion) {
65         return createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
66                 new HashMap<String, String>());
67     }
68 
createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)69     private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion,
70             final HashMap<String, String> attributeMap) {
71         try {
72             final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion,
73                     attributeMap);
74             mDictFilesToBeDeleted.add(dictFile);
75             return dictFile;
76         } catch (final IOException e) {
77             fail(e.toString());
78         }
79         return null;
80     }
81 
createEmptyVer4DictionaryAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap)82     private File createEmptyVer4DictionaryAndGetFile(final int formatVersion,
83             final HashMap<String, String> attributeMap) throws IOException {
84         final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION,
85                 getContext().getCacheDir());
86         file.delete();
87         file.mkdir();
88         if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion,
89                 Locale.ENGLISH, attributeMap)) {
90             return file;
91         }
92         throw new IOException("Empty dictionary " + file.getAbsolutePath()
93                 + " cannot be created. Format version: " + formatVersion);
94     }
95 
getBinaryDictionary(final File dictFile)96     private static BinaryDictionary getBinaryDictionary(final File dictFile) {
97         return new BinaryDictionary(dictFile.getAbsolutePath(),
98                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
99                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
100     }
101 
getEmptyBinaryDictionary(final int formatVersion)102     private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) {
103         final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
104         return new BinaryDictionary(dictFile.getAbsolutePath(),
105                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
106                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
107     }
108 
testIsValidDictionary()109     public void testIsValidDictionary() {
110         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
111         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
112         assertTrue("binaryDictionary must be valid for existing valid dictionary file.",
113                 binaryDictionary.isValidDictionary());
114         binaryDictionary.close();
115         assertFalse("binaryDictionary must be invalid after closing.",
116                 binaryDictionary.isValidDictionary());
117         FileUtils.deleteRecursively(dictFile);
118         binaryDictionary = getBinaryDictionary(dictFile);
119         assertFalse("binaryDictionary must be invalid for not existing dictionary file.",
120                 binaryDictionary.isValidDictionary());
121         binaryDictionary.close();
122     }
123 
testConstructingDictionaryOnMemory()124     public void testConstructingDictionaryOnMemory() {
125         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
126         FileUtils.deleteRecursively(dictFile);
127         assertFalse(dictFile.exists());
128         final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
129                 true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
130                 FormatSpec.VERSION403, new HashMap<String, String>());
131         assertTrue(binaryDictionary.isValidDictionary());
132         assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
133         final int probability = 100;
134         addUnigramWord(binaryDictionary, "word", probability);
135         assertEquals(probability, binaryDictionary.getFrequency("word"));
136         assertFalse(dictFile.exists());
137         binaryDictionary.flush();
138         assertTrue(dictFile.exists());
139         assertTrue(binaryDictionary.isValidDictionary());
140         assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
141         assertEquals(probability, binaryDictionary.getFrequency("word"));
142         binaryDictionary.close();
143     }
144 
testAddTooLongWord()145     public void testAddTooLongWord() {
146         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
147         final StringBuffer stringBuilder = new StringBuffer();
148         for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) {
149             stringBuilder.append('a');
150         }
151         final String validLongWord = stringBuilder.toString();
152         stringBuilder.append('a');
153         final String invalidLongWord = stringBuilder.toString();
154         final int probability = 100;
155         addUnigramWord(binaryDictionary, "aaa", probability);
156         addUnigramWord(binaryDictionary, validLongWord, probability);
157         addUnigramWord(binaryDictionary, invalidLongWord, probability);
158         // Too long short cut.
159         binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */,
160                 false /* isNotAWord */, false /* isPossiblyOffensive */,
161                 BinaryDictionary.NOT_A_VALID_TIMESTAMP);
162         addUnigramWord(binaryDictionary, "abc", probability);
163         final int updatedProbability = 200;
164         // Update.
165         addUnigramWord(binaryDictionary, validLongWord, updatedProbability);
166         addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability);
167         addUnigramWord(binaryDictionary, "abc", updatedProbability);
168 
169         assertEquals(probability, binaryDictionary.getFrequency("aaa"));
170         assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord));
171         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord));
172         assertEquals(updatedProbability, binaryDictionary.getFrequency("abc"));
173     }
174 
addUnigramWord(final BinaryDictionary binaryDictionary, final String word, final int probability)175     private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word,
176             final int probability) {
177         binaryDictionary.addUnigramEntry(word, probability,
178                 false /* isBeginningOfSentence */, false /* isNotAWord */,
179                 false /* isPossiblyOffensive */,
180                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
181     }
182 
addBigramWords(final BinaryDictionary binaryDictionary, final String word0, final String word1, final int probability)183     private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
184             final String word1, final int probability) {
185         binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability,
186                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
187     }
188 
addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2, final int probability)189     private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0,
190             final String word1, final String word2, final int probability) {
191         binaryDictionary.addNgramEntry(
192                 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2,
193                 probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
194     }
195 
isValidBigram(final BinaryDictionary binaryDictionary, final String word0, final String word1)196     private static boolean isValidBigram(final BinaryDictionary binaryDictionary,
197             final String word0, final String word1) {
198         return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1);
199     }
200 
getBigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1)201     private static int getBigramProbability(final BinaryDictionary binaryDictionary,
202             final String word0,  final String word1) {
203         return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1);
204     }
205 
getTrigramProbability(final BinaryDictionary binaryDictionary, final String word0, final String word1, final String word2)206     private static int getTrigramProbability(final BinaryDictionary binaryDictionary,
207             final String word0, final String word1, final String word2) {
208         return binaryDictionary.getNgramProbability(
209                 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2);
210     }
211 
testAddUnigramWord()212     public void testAddUnigramWord() {
213         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
214         final int probability = 100;
215         addUnigramWord(binaryDictionary, "aaa", probability);
216         // Reallocate and create.
217         addUnigramWord(binaryDictionary, "aab", probability);
218         // Insert into children.
219         addUnigramWord(binaryDictionary, "aac", probability);
220         // Make terminal.
221         addUnigramWord(binaryDictionary, "aa", probability);
222         // Create children.
223         addUnigramWord(binaryDictionary, "aaaa", probability);
224         // Reallocate and make termianl.
225         addUnigramWord(binaryDictionary, "a", probability);
226 
227         final int updatedProbability = 200;
228         // Update.
229         addUnigramWord(binaryDictionary, "aaa", updatedProbability);
230 
231         assertEquals(probability, binaryDictionary.getFrequency("aab"));
232         assertEquals(probability, binaryDictionary.getFrequency("aac"));
233         assertEquals(probability, binaryDictionary.getFrequency("aa"));
234         assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
235         assertEquals(probability, binaryDictionary.getFrequency("a"));
236         assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
237     }
238 
testRandomlyAddUnigramWord()239     public void testRandomlyAddUnigramWord() {
240         final int wordCount = 1000;
241         final int codePointSetSize = 50;
242         final long seed = System.currentTimeMillis();
243         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
244 
245         final HashMap<String, Integer> probabilityMap = new HashMap<>();
246         // Test a word that isn't contained within the dictionary.
247         final Random random = new Random(seed);
248         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
249         for (int i = 0; i < wordCount; ++i) {
250             final String word = CodePointUtils.generateWord(random, codePointSet);
251             probabilityMap.put(word, random.nextInt(0xFF));
252         }
253         for (String word : probabilityMap.keySet()) {
254             addUnigramWord(binaryDictionary, word, probabilityMap.get(word));
255         }
256         for (String word : probabilityMap.keySet()) {
257             assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word));
258         }
259     }
260 
testAddBigramWords()261     public void testAddBigramWords() {
262         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
263 
264         final int unigramProbability = 100;
265         final int bigramProbability = 150;
266         final int updatedBigramProbability = 200;
267         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
268         addUnigramWord(binaryDictionary, "abb", unigramProbability);
269         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
270         addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
271         addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
272         addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
273         addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
274 
275         assertTrue(isValidBigram(binaryDictionary, "aaa", "abb"));
276         assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc"));
277         assertTrue(isValidBigram(binaryDictionary, "abb", "aaa"));
278         assertTrue(isValidBigram(binaryDictionary, "abb", "bcc"));
279         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
280         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
281         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
282         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
283 
284         addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability);
285         assertEquals(updatedBigramProbability,
286                 getBigramProbability(binaryDictionary, "aaa", "abb"));
287 
288         assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
289         assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
290         assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
291         assertEquals(Dictionary.NOT_A_PROBABILITY,
292                 getBigramProbability(binaryDictionary, "bcc", "aaa"));
293         assertEquals(Dictionary.NOT_A_PROBABILITY,
294                 getBigramProbability(binaryDictionary, "bcc", "bbc"));
295         assertEquals(Dictionary.NOT_A_PROBABILITY,
296                 getBigramProbability(binaryDictionary, "aaa", "aaa"));
297 
298         // Testing bigram link.
299         addUnigramWord(binaryDictionary, "abcde", unigramProbability);
300         addUnigramWord(binaryDictionary, "fghij", unigramProbability);
301         addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability);
302         addUnigramWord(binaryDictionary, "fgh", unigramProbability);
303         addUnigramWord(binaryDictionary, "abc", unigramProbability);
304         addUnigramWord(binaryDictionary, "f", unigramProbability);
305 
306         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij"));
307         assertEquals(Dictionary.NOT_A_PROBABILITY,
308                 getBigramProbability(binaryDictionary, "abcde", "fgh"));
309         addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability);
310         assertEquals(updatedBigramProbability,
311                 getBigramProbability(binaryDictionary, "abcde", "fghij"));
312     }
313 
testRandomlyAddBigramWords()314     public void testRandomlyAddBigramWords() {
315         final int wordCount = 100;
316         final int bigramCount = 1000;
317         final int codePointSetSize = 50;
318         final long seed = System.currentTimeMillis();
319         final Random random = new Random(seed);
320         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
321 
322         final ArrayList<String> words = new ArrayList<>();
323         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
324         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
325         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
326         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
327 
328         for (int i = 0; i < wordCount; ++i) {
329             final String word = CodePointUtils.generateWord(random, codePointSet);
330             words.add(word);
331             final int unigramProbability = random.nextInt(0xFF);
332             unigramProbabilities.put(word, unigramProbability);
333             addUnigramWord(binaryDictionary, word, unigramProbability);
334         }
335 
336         for (int i = 0; i < bigramCount; i++) {
337             final String word0 = words.get(random.nextInt(wordCount));
338             final String word1 = words.get(random.nextInt(wordCount));
339             if (TextUtils.equals(word0, word1)) {
340                 continue;
341             }
342             final Pair<String, String> bigram = new Pair<>(word0, word1);
343             bigramWords.add(bigram);
344             final int unigramProbability = unigramProbabilities.get(word1);
345             final int bigramProbability =
346                     unigramProbability + random.nextInt(0xFF - unigramProbability);
347             bigramProbabilities.put(bigram, bigramProbability);
348             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
349         }
350 
351         for (final Pair<String, String> bigram : bigramWords) {
352             final int bigramProbability = bigramProbabilities.get(bigram);
353             assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
354                     isValidBigram(binaryDictionary, bigram.first, bigram.second));
355             assertEquals(bigramProbability,
356                     getBigramProbability(binaryDictionary, bigram.first, bigram.second));
357         }
358     }
359 
testAddTrigramWords()360     public void testAddTrigramWords() {
361         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
362         final int unigramProbability = 100;
363         final int trigramProbability = 150;
364         final int updatedTrigramProbability = 200;
365         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
366         addUnigramWord(binaryDictionary, "abb", unigramProbability);
367         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
368 
369         addBigramWords(binaryDictionary, "abb", "bcc", 10);
370         addBigramWords(binaryDictionary, "abb", "aaa", 10);
371 
372         addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability);
373         addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability);
374 
375         assertEquals(trigramProbability,
376                 getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc"));
377         assertEquals(trigramProbability,
378                 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
379         assertFalse(isValidBigram(binaryDictionary, "aaa", "abb"));
380 
381         addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability);
382         assertEquals(updatedTrigramProbability,
383                 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
384     }
385 
testFlushDictionary()386     public void testFlushDictionary() {
387         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
388         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
389 
390         final int probability = 100;
391         addUnigramWord(binaryDictionary, "aaa", probability);
392         addUnigramWord(binaryDictionary, "abcd", probability);
393         // Close without flushing.
394         binaryDictionary.close();
395 
396         binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
397                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
398                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
399 
400         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa"));
401         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd"));
402 
403         addUnigramWord(binaryDictionary, "aaa", probability);
404         addUnigramWord(binaryDictionary, "abcd", probability);
405         binaryDictionary.flush();
406         binaryDictionary.close();
407 
408         binaryDictionary = getBinaryDictionary(dictFile);
409         assertEquals(probability, binaryDictionary.getFrequency("aaa"));
410         assertEquals(probability, binaryDictionary.getFrequency("abcd"));
411         addUnigramWord(binaryDictionary, "bcde", probability);
412         binaryDictionary.flush();
413         binaryDictionary.close();
414 
415         binaryDictionary = getBinaryDictionary(dictFile);
416         assertEquals(probability, binaryDictionary.getFrequency("bcde"));
417         binaryDictionary.close();
418     }
419 
testFlushWithGCDictionary()420     public void testFlushWithGCDictionary() {
421         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
422         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
423         final int unigramProbability = 100;
424         final int bigramProbability = 150;
425         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
426         addUnigramWord(binaryDictionary, "abb", unigramProbability);
427         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
428         addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
429         addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
430         addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
431         addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
432         binaryDictionary.flushWithGC();
433         binaryDictionary.close();
434 
435         binaryDictionary = getBinaryDictionary(dictFile);
436         assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
437         assertEquals(unigramProbability, binaryDictionary.getFrequency("abb"));
438         assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc"));
439         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
440         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
441         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
442         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
443         assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
444         assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
445         assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
446         binaryDictionary.flushWithGC();
447         binaryDictionary.close();
448     }
449 
testAddBigramWordsAndFlashWithGC()450     public void testAddBigramWordsAndFlashWithGC() {
451         final int wordCount = 100;
452         final int bigramCount = 1000;
453         final int codePointSetSize = 30;
454         final long seed = System.currentTimeMillis();
455         final Random random = new Random(seed);
456 
457         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
458         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
459 
460         final ArrayList<String> words = new ArrayList<>();
461         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
462         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
463         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
464         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
465 
466         for (int i = 0; i < wordCount; ++i) {
467             final String word = CodePointUtils.generateWord(random, codePointSet);
468             words.add(word);
469             final int unigramProbability = random.nextInt(0xFF);
470             unigramProbabilities.put(word, unigramProbability);
471             addUnigramWord(binaryDictionary, word, unigramProbability);
472         }
473 
474         for (int i = 0; i < bigramCount; i++) {
475             final String word0 = words.get(random.nextInt(wordCount));
476             final String word1 = words.get(random.nextInt(wordCount));
477             if (TextUtils.equals(word0, word1)) {
478                 continue;
479             }
480             final Pair<String, String> bigram = new Pair<>(word0, word1);
481             bigramWords.add(bigram);
482             final int unigramProbability = unigramProbabilities.get(word1);
483             final int bigramProbability =
484                     unigramProbability + random.nextInt(0xFF - unigramProbability);
485             bigramProbabilities.put(bigram, bigramProbability);
486             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
487         }
488 
489         binaryDictionary.flushWithGC();
490         binaryDictionary.close();
491         binaryDictionary = getBinaryDictionary(dictFile);
492 
493         for (final Pair<String, String> bigram : bigramWords) {
494             final int bigramProbability = bigramProbabilities.get(bigram);
495             assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
496                     isValidBigram(binaryDictionary, bigram.first, bigram.second));
497             assertEquals(bigramProbability,
498                     getBigramProbability(binaryDictionary, bigram.first, bigram.second));
499         }
500     }
501 
testRandomOperationsAndFlashWithGC()502     public void testRandomOperationsAndFlashWithGC() {
503         final int maxUnigramCount = 5000;
504         final int maxBigramCount = 10000;
505         final HashMap<String, String> attributeMap = new HashMap<>();
506         attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
507         attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
508 
509         final int flashWithGCIterationCount = 50;
510         final int operationCountInEachIteration = 200;
511         final int initialUnigramCount = 100;
512         final float addUnigramProb = 0.5f;
513         final float addBigramProb = 0.8f;
514         final int codePointSetSize = 30;
515 
516         final long seed = System.currentTimeMillis();
517         final Random random = new Random(seed);
518         final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
519                 attributeMap);
520         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
521 
522         final ArrayList<String> words = new ArrayList<>();
523         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
524         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
525         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
526         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
527         for (int i = 0; i < initialUnigramCount; ++i) {
528             final String word = CodePointUtils.generateWord(random, codePointSet);
529             words.add(word);
530             final int unigramProbability = random.nextInt(0xFF);
531             unigramProbabilities.put(word, unigramProbability);
532             addUnigramWord(binaryDictionary, word, unigramProbability);
533         }
534         binaryDictionary.flushWithGC();
535         binaryDictionary.close();
536 
537         for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) {
538             binaryDictionary = getBinaryDictionary(dictFile);
539             for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) {
540                 // Add unigram.
541                 if (random.nextFloat() < addUnigramProb) {
542                     final String word = CodePointUtils.generateWord(random, codePointSet);
543                     words.add(word);
544                     final int unigramProbability = random.nextInt(0xFF);
545                     unigramProbabilities.put(word, unigramProbability);
546                     addUnigramWord(binaryDictionary, word, unigramProbability);
547                 }
548                 // Add bigram.
549                 if (random.nextFloat() < addBigramProb && words.size() > 2) {
550                     final int word0Index = random.nextInt(words.size());
551                     int word1Index = random.nextInt(words.size() - 1);
552                     if (word0Index <= word1Index) {
553                         word1Index++;
554                     }
555                     final String word0 = words.get(word0Index);
556                     final String word1 = words.get(word1Index);
557                     if (TextUtils.equals(word0, word1)) {
558                         continue;
559                     }
560                     final int unigramProbability = unigramProbabilities.get(word1);
561                     final int bigramProbability =
562                             unigramProbability + random.nextInt(0xFF - unigramProbability);
563                     final Pair<String, String> bigram = new Pair<>(word0, word1);
564                     bigramWords.add(bigram);
565                     bigramProbabilities.put(bigram, bigramProbability);
566                     addBigramWords(binaryDictionary, word0, word1, bigramProbability);
567                 }
568             }
569 
            // Test whether all the unigram operations are correctly handled.
571             for (int i = 0; i < words.size(); i++) {
572                 final String word = words.get(i);
573                 final int unigramProbability = unigramProbabilities.get(word);
574                 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
575             }
            // Test whether all the bigram operations are correctly handled.
577             for (int i = 0; i < bigramWords.size(); i++) {
578                 final Pair<String, String> bigram = bigramWords.get(i);
579                 final int probability;
580                 if (bigramProbabilities.containsKey(bigram)) {
581                     probability = bigramProbabilities.get(bigram);
582                 } else {
583                     probability = Dictionary.NOT_A_PROBABILITY;
584                 }
585 
586                 assertEquals(probability,
587                         getBigramProbability(binaryDictionary, bigram.first, bigram.second));
588                 assertEquals(probability != Dictionary.NOT_A_PROBABILITY,
589                         isValidBigram(binaryDictionary, bigram.first, bigram.second));
590             }
591             binaryDictionary.flushWithGC();
592             binaryDictionary.close();
593         }
594     }
595 
testAddManyUnigramsAndFlushWithGC()596     public void testAddManyUnigramsAndFlushWithGC() {
597         final int flashWithGCIterationCount = 3;
598         final int codePointSetSize = 50;
599 
600         final long seed = System.currentTimeMillis();
601         final Random random = new Random(seed);
602 
603         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
604 
605         final ArrayList<String> words = new ArrayList<>();
606         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
607         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
608 
609         BinaryDictionary binaryDictionary;
610         for (int i = 0; i < flashWithGCIterationCount; i++) {
611             binaryDictionary = getBinaryDictionary(dictFile);
612             while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
613                 final String word = CodePointUtils.generateWord(random, codePointSet);
614                 words.add(word);
615                 final int unigramProbability = random.nextInt(0xFF);
616                 unigramProbabilities.put(word, unigramProbability);
617                 addUnigramWord(binaryDictionary, word, unigramProbability);
618             }
619 
620             for (int j = 0; j < words.size(); j++) {
621                 final String word = words.get(j);
622                 final int unigramProbability = unigramProbabilities.get(word);
623                 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
624             }
625 
626             binaryDictionary.flushWithGC();
627             binaryDictionary.close();
628         }
629     }
630 
testUnigramAndBigramCount()631     public void testUnigramAndBigramCount() {
632         final int maxUnigramCount = 5000;
633         final int maxBigramCount = 10000;
634         final HashMap<String, String> attributeMap = new HashMap<>();
635         attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
636         attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
637 
638         final int flashWithGCIterationCount = 10;
639         final int codePointSetSize = 50;
640         final int unigramCountPerIteration = 1000;
641         final int bigramCountPerIteration = 2000;
642         final long seed = System.currentTimeMillis();
643         final Random random = new Random(seed);
644         final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
645                 attributeMap);
646 
647         final ArrayList<String> words = new ArrayList<>();
648         final HashSet<Pair<String, String>> bigrams = new HashSet<>();
649         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
650 
651         BinaryDictionary binaryDictionary;
652         for (int i = 0; i < flashWithGCIterationCount; i++) {
653             binaryDictionary = getBinaryDictionary(dictFile);
654             for (int j = 0; j < unigramCountPerIteration; j++) {
655                 final String word = CodePointUtils.generateWord(random, codePointSet);
656                 words.add(word);
657                 final int unigramProbability = random.nextInt(0xFF);
658                 addUnigramWord(binaryDictionary, word, unigramProbability);
659             }
660             for (int j = 0; j < bigramCountPerIteration; j++) {
661                 final String word0 = words.get(random.nextInt(words.size()));
662                 final String word1 = words.get(random.nextInt(words.size()));
663                 if (TextUtils.equals(word0, word1)) {
664                     continue;
665                 }
666                 bigrams.add(new Pair<>(word0, word1));
667                 final int bigramProbability = random.nextInt(0xF);
668                 addBigramWords(binaryDictionary, word0, word1, bigramProbability);
669             }
670             assertEquals(new HashSet<>(words).size(), Integer.parseInt(
671                     binaryDictionary.getPropertyForGettingStats(
672                             BinaryDictionary.UNIGRAM_COUNT_QUERY)));
673             assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
674                     binaryDictionary.getPropertyForGettingStats(
675                             BinaryDictionary.BIGRAM_COUNT_QUERY)));
676             binaryDictionary.flushWithGC();
677             assertEquals(new HashSet<>(words).size(), Integer.parseInt(
678                     binaryDictionary.getPropertyForGettingStats(
679                             BinaryDictionary.UNIGRAM_COUNT_QUERY)));
680             assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
681                     binaryDictionary.getPropertyForGettingStats(
682                             BinaryDictionary.BIGRAM_COUNT_QUERY)));
683             binaryDictionary.close();
684         }
685     }
686 
testGetWordProperties()687     public void testGetWordProperties() {
688         final long seed = System.currentTimeMillis();
689         final Random random = new Random(seed);
690         final int UNIGRAM_COUNT = 1000;
691         final int BIGRAM_COUNT = 1000;
692         final int codePointSetSize = 20;
693         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
694         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
695         final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
696 
697         final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
698                 false /* isBeginningOfSentence */);
699         assertFalse(invalidWordProperty.isValid());
700 
701         final ArrayList<String> words = new ArrayList<>();
702         final HashMap<String, Integer> wordProbabilities = new HashMap<>();
703         final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
704         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
705 
706         for (int i = 0; i < UNIGRAM_COUNT; i++) {
707             final String word = CodePointUtils.generateWord(random, codePointSet);
708             final int unigramProbability = random.nextInt(0xFF);
709             final boolean isNotAWord = random.nextBoolean();
710             final boolean isPossiblyOffensive = random.nextBoolean();
711             // TODO: Add tests for historical info.
712             binaryDictionary.addUnigramEntry(word, unigramProbability,
713                     false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive,
714                     BinaryDictionary.NOT_A_VALID_TIMESTAMP);
715             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
716                 binaryDictionary.flushWithGC();
717             }
718             words.add(word);
719             wordProbabilities.put(word, unigramProbability);
720             final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
721                     false /* isBeginningOfSentence */);
722             assertEquals(word, wordProperty.mWord);
723             assertTrue(wordProperty.isValid());
724             assertEquals(isNotAWord, wordProperty.mIsNotAWord);
725             assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive);
726             assertEquals(false, wordProperty.mHasNgrams);
727             assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
728         }
729 
730         for (int i = 0; i < BIGRAM_COUNT; i++) {
731             final int word0Index = random.nextInt(wordProbabilities.size());
732             final int word1Index = random.nextInt(wordProbabilities.size());
733             if (word0Index == word1Index) {
734                 continue;
735             }
736             final String word0 = words.get(word0Index);
737             final String word1 = words.get(word1Index);
738             final int unigramProbability = wordProbabilities.get(word1);
739             final int bigramProbability =
740                     unigramProbability + random.nextInt(0xFF - unigramProbability);
741             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
742             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
743                 binaryDictionary.flushWithGC();
744             }
745             if (!bigrams.containsKey(word0)) {
746                 final HashSet<String> bigramWord1s = new HashSet<>();
747                 bigrams.put(word0, bigramWord1s);
748             }
749             bigrams.get(word0).add(word1);
750             bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability);
751         }
752 
753         for (int i = 0; i < words.size(); i++) {
754             final String word0 = words.get(i);
755             if (!bigrams.containsKey(word0)) {
756                 continue;
757             }
758             final HashSet<String> bigramWord1s = bigrams.get(word0);
759             final WordProperty wordProperty = binaryDictionary.getWordProperty(word0,
760                     false /* isBeginningOfSentence */);
761             assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size());
762             // TODO: Support ngram.
763             for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
764                 final String word1 = bigramTarget.mWord;
765                 assertTrue(bigramWord1s.contains(word1));
766                 final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1));
767                 assertEquals(bigramProbability, bigramTarget.getProbability());
768             }
769         }
770     }
771 
testIterateAllWords()772     public void testIterateAllWords() {
773         final long seed = System.currentTimeMillis();
774         final Random random = new Random(seed);
775         final int UNIGRAM_COUNT = 1000;
776         final int BIGRAM_COUNT = 1000;
777         final int codePointSetSize = 20;
778         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
779         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
780 
781         final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
782                 false /* isBeginningOfSentence */);
783         assertFalse(invalidWordProperty.isValid());
784 
785         final ArrayList<String> words = new ArrayList<>();
786         final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>();
787         final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
788         final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
789                 new HashMap<>();
790 
791         for (int i = 0; i < UNIGRAM_COUNT; i++) {
792             final String word = CodePointUtils.generateWord(random, codePointSet);
793             final int unigramProbability = random.nextInt(0xFF);
794             addUnigramWord(binaryDictionary, word, unigramProbability);
795             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
796                 binaryDictionary.flushWithGC();
797             }
798             words.add(word);
799             wordProbabilitiesToCheckLater.put(word, unigramProbability);
800         }
801 
802         for (int i = 0; i < BIGRAM_COUNT; i++) {
803             final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
804             final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
805             if (word0Index == word1Index) {
806                 continue;
807             }
808             final String word0 = words.get(word0Index);
809             final String word1 = words.get(word1Index);
810             final int unigramProbability = wordProbabilitiesToCheckLater.get(word1);
811             final int bigramProbability =
812                     unigramProbability + random.nextInt(0xFF - unigramProbability);
813             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
814             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
815                 binaryDictionary.flushWithGC();
816             }
817             if (!bigrams.containsKey(word0)) {
818                 final HashSet<String> bigramWord1s = new HashSet<>();
819                 bigrams.put(word0, bigramWord1s);
820             }
821             bigrams.get(word0).add(word1);
822             bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability);
823         }
824 
825         final HashSet<String> wordSet = new HashSet<>(words);
826         final HashSet<Pair<String, String>> bigramSet =
827                 new HashSet<>(bigramProbabilitiesToCheckLater.keySet());
828         int token = 0;
829         do {
830             final BinaryDictionary.GetNextWordPropertyResult result =
831                     binaryDictionary.getNextWordProperty(token);
832             final WordProperty wordProperty = result.mWordProperty;
833             final String word0 = wordProperty.mWord;
834             assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
835                     wordProperty.mProbabilityInfo.mProbability);
836             wordSet.remove(word0);
837             final HashSet<String> bigramWord1s = bigrams.get(word0);
838             // TODO: Support ngram.
839             if (wordProperty.mHasNgrams) {
840                 for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
841                     final String word1 = bigramTarget.mWord;
842                     assertTrue(bigramWord1s.contains(word1));
843                     final Pair<String, String> bigram = new Pair<>(word0, word1);
844                     final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram);
845                     assertEquals(bigramProbability, bigramTarget.getProbability());
846                     bigramSet.remove(bigram);
847                 }
848             }
849             token = result.mNextToken;
850         } while (token != 0);
851         assertTrue(wordSet.isEmpty());
852         assertTrue(bigramSet.isEmpty());
853     }
854 
testPossiblyOffensiveAttributeMaintained()855     public void testPossiblyOffensiveAttributeMaintained() {
856         final BinaryDictionary binaryDictionary =
857                 getEmptyBinaryDictionary(FormatSpec.VERSION403);
858         binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0);
859         WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false);
860         assertEquals(true, wordProperty.mIsPossiblyOffensive);
861     }
862 
testBeginningOfSentence()863     public void testBeginningOfSentence() {
864         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
865         final int dummyProbability = 0;
866         final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
867         final int bigramProbability = 200;
868         addUnigramWord(binaryDictionary, "aaa", dummyProbability);
869         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
870                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
871         assertEquals(bigramProbability,
872                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
873         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
874                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
875         addUnigramWord(binaryDictionary, "bbb", dummyProbability);
876         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability,
877                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
878         binaryDictionary.flushWithGC();
879         assertEquals(bigramProbability,
880                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
881         assertEquals(bigramProbability,
882                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb"));
883     }
884 }
885