1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import android.test.AndroidTestCase;
20import android.test.suitebuilder.annotation.LargeTest;
21import android.text.TextUtils;
22import android.util.Pair;
23
24import com.android.inputmethod.latin.NgramContext.WordInfo;
25import com.android.inputmethod.latin.common.CodePointUtils;
26import com.android.inputmethod.latin.common.FileUtils;
27import com.android.inputmethod.latin.makedict.DictionaryHeader;
28import com.android.inputmethod.latin.makedict.FormatSpec;
29import com.android.inputmethod.latin.makedict.WeightedString;
30import com.android.inputmethod.latin.makedict.WordProperty;
31import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
32
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
40
41@LargeTest
42public class BinaryDictionaryTests extends AndroidTestCase {
43    private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
44    private static final String TEST_LOCALE = "test";
45    private static final String DICTIONARY_ID = "TestBinaryDictionary";
46
47    private HashSet<File> mDictFilesToBeDeleted = new HashSet<>();
48
49    @Override
50    protected void setUp() throws Exception {
51        super.setUp();
52        mDictFilesToBeDeleted.clear();
53    }
54
55    @Override
56    protected void tearDown() throws Exception {
57        for (final File dictFile : mDictFilesToBeDeleted) {
58            dictFile.delete();
59        }
60        mDictFilesToBeDeleted.clear();
61        super.tearDown();
62    }
63
64    private File createEmptyDictionaryAndGetFile(final int formatVersion) {
65        return createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
66                new HashMap<String, String>());
67    }
68
69    private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion,
70            final HashMap<String, String> attributeMap) {
71        try {
72            final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion,
73                    attributeMap);
74            mDictFilesToBeDeleted.add(dictFile);
75            return dictFile;
76        } catch (final IOException e) {
77            fail(e.toString());
78        }
79        return null;
80    }
81
82    private File createEmptyVer4DictionaryAndGetFile(final int formatVersion,
83            final HashMap<String, String> attributeMap) throws IOException {
84        final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION,
85                getContext().getCacheDir());
86        file.delete();
87        file.mkdir();
88        if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion,
89                Locale.ENGLISH, attributeMap)) {
90            return file;
91        }
92        throw new IOException("Empty dictionary " + file.getAbsolutePath()
93                + " cannot be created. Format version: " + formatVersion);
94    }
95
96    private static BinaryDictionary getBinaryDictionary(final File dictFile) {
97        return new BinaryDictionary(dictFile.getAbsolutePath(),
98                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
99                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
100    }
101
102    private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) {
103        final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
104        return new BinaryDictionary(dictFile.getAbsolutePath(),
105                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
106                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
107    }
108
109    public void testIsValidDictionary() {
110        final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
111        BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
112        assertTrue("binaryDictionary must be valid for existing valid dictionary file.",
113                binaryDictionary.isValidDictionary());
114        binaryDictionary.close();
115        assertFalse("binaryDictionary must be invalid after closing.",
116                binaryDictionary.isValidDictionary());
117        FileUtils.deleteRecursively(dictFile);
118        binaryDictionary = getBinaryDictionary(dictFile);
119        assertFalse("binaryDictionary must be invalid for not existing dictionary file.",
120                binaryDictionary.isValidDictionary());
121        binaryDictionary.close();
122    }
123
124    public void testConstructingDictionaryOnMemory() {
125        final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
126        FileUtils.deleteRecursively(dictFile);
127        assertFalse(dictFile.exists());
128        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
129                true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
130                FormatSpec.VERSION403, new HashMap<String, String>());
131        assertTrue(binaryDictionary.isValidDictionary());
132        assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
133        final int probability = 100;
134        addUnigramWord(binaryDictionary, "word", probability);
135        assertEquals(probability, binaryDictionary.getFrequency("word"));
136        assertFalse(dictFile.exists());
137        binaryDictionary.flush();
138        assertTrue(dictFile.exists());
139        assertTrue(binaryDictionary.isValidDictionary());
140        assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
141        assertEquals(probability, binaryDictionary.getFrequency("word"));
142        binaryDictionary.close();
143    }
144
145    public void testAddTooLongWord() {
146        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
147        final StringBuffer stringBuilder = new StringBuffer();
148        for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) {
149            stringBuilder.append('a');
150        }
151        final String validLongWord = stringBuilder.toString();
152        stringBuilder.append('a');
153        final String invalidLongWord = stringBuilder.toString();
154        final int probability = 100;
155        addUnigramWord(binaryDictionary, "aaa", probability);
156        addUnigramWord(binaryDictionary, validLongWord, probability);
157        addUnigramWord(binaryDictionary, invalidLongWord, probability);
158        // Too long short cut.
159        binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */,
160                false /* isNotAWord */, false /* isPossiblyOffensive */,
161                BinaryDictionary.NOT_A_VALID_TIMESTAMP);
162        addUnigramWord(binaryDictionary, "abc", probability);
163        final int updatedProbability = 200;
164        // Update.
165        addUnigramWord(binaryDictionary, validLongWord, updatedProbability);
166        addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability);
167        addUnigramWord(binaryDictionary, "abc", updatedProbability);
168
169        assertEquals(probability, binaryDictionary.getFrequency("aaa"));
170        assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord));
171        assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord));
172        assertEquals(updatedProbability, binaryDictionary.getFrequency("abc"));
173    }
174
175    private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word,
176            final int probability) {
177        binaryDictionary.addUnigramEntry(word, probability,
178                false /* isBeginningOfSentence */, false /* isNotAWord */,
179                false /* isPossiblyOffensive */,
180                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
181    }
182
183    private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
184            final String word1, final int probability) {
185        binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability,
186                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
187    }
188
189    private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0,
190            final String word1, final String word2, final int probability) {
191        binaryDictionary.addNgramEntry(
192                new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2,
193                probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
194    }
195
196    private static boolean isValidBigram(final BinaryDictionary binaryDictionary,
197            final String word0, final String word1) {
198        return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1);
199    }
200
201    private static int getBigramProbability(final BinaryDictionary binaryDictionary,
202            final String word0,  final String word1) {
203        return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1);
204    }
205
206    private static int getTrigramProbability(final BinaryDictionary binaryDictionary,
207            final String word0, final String word1, final String word2) {
208        return binaryDictionary.getNgramProbability(
209                new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2);
210    }
211
212    public void testAddUnigramWord() {
213        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
214        final int probability = 100;
215        addUnigramWord(binaryDictionary, "aaa", probability);
216        // Reallocate and create.
217        addUnigramWord(binaryDictionary, "aab", probability);
218        // Insert into children.
219        addUnigramWord(binaryDictionary, "aac", probability);
220        // Make terminal.
221        addUnigramWord(binaryDictionary, "aa", probability);
222        // Create children.
223        addUnigramWord(binaryDictionary, "aaaa", probability);
224        // Reallocate and make termianl.
225        addUnigramWord(binaryDictionary, "a", probability);
226
227        final int updatedProbability = 200;
228        // Update.
229        addUnigramWord(binaryDictionary, "aaa", updatedProbability);
230
231        assertEquals(probability, binaryDictionary.getFrequency("aab"));
232        assertEquals(probability, binaryDictionary.getFrequency("aac"));
233        assertEquals(probability, binaryDictionary.getFrequency("aa"));
234        assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
235        assertEquals(probability, binaryDictionary.getFrequency("a"));
236        assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
237    }
238
239    public void testRandomlyAddUnigramWord() {
240        final int wordCount = 1000;
241        final int codePointSetSize = 50;
242        final long seed = System.currentTimeMillis();
243        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
244
245        final HashMap<String, Integer> probabilityMap = new HashMap<>();
246        // Test a word that isn't contained within the dictionary.
247        final Random random = new Random(seed);
248        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
249        for (int i = 0; i < wordCount; ++i) {
250            final String word = CodePointUtils.generateWord(random, codePointSet);
251            probabilityMap.put(word, random.nextInt(0xFF));
252        }
253        for (String word : probabilityMap.keySet()) {
254            addUnigramWord(binaryDictionary, word, probabilityMap.get(word));
255        }
256        for (String word : probabilityMap.keySet()) {
257            assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word));
258        }
259    }
260
261    public void testAddBigramWords() {
262        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
263
264        final int unigramProbability = 100;
265        final int bigramProbability = 150;
266        final int updatedBigramProbability = 200;
267        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
268        addUnigramWord(binaryDictionary, "abb", unigramProbability);
269        addUnigramWord(binaryDictionary, "bcc", unigramProbability);
270        addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
271        addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
272        addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
273        addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
274
275        assertTrue(isValidBigram(binaryDictionary, "aaa", "abb"));
276        assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc"));
277        assertTrue(isValidBigram(binaryDictionary, "abb", "aaa"));
278        assertTrue(isValidBigram(binaryDictionary, "abb", "bcc"));
279        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
280        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
281        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
282        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
283
284        addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability);
285        assertEquals(updatedBigramProbability,
286                getBigramProbability(binaryDictionary, "aaa", "abb"));
287
288        assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
289        assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
290        assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
291        assertEquals(Dictionary.NOT_A_PROBABILITY,
292                getBigramProbability(binaryDictionary, "bcc", "aaa"));
293        assertEquals(Dictionary.NOT_A_PROBABILITY,
294                getBigramProbability(binaryDictionary, "bcc", "bbc"));
295        assertEquals(Dictionary.NOT_A_PROBABILITY,
296                getBigramProbability(binaryDictionary, "aaa", "aaa"));
297
298        // Testing bigram link.
299        addUnigramWord(binaryDictionary, "abcde", unigramProbability);
300        addUnigramWord(binaryDictionary, "fghij", unigramProbability);
301        addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability);
302        addUnigramWord(binaryDictionary, "fgh", unigramProbability);
303        addUnigramWord(binaryDictionary, "abc", unigramProbability);
304        addUnigramWord(binaryDictionary, "f", unigramProbability);
305
306        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij"));
307        assertEquals(Dictionary.NOT_A_PROBABILITY,
308                getBigramProbability(binaryDictionary, "abcde", "fgh"));
309        addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability);
310        assertEquals(updatedBigramProbability,
311                getBigramProbability(binaryDictionary, "abcde", "fghij"));
312    }
313
314    public void testRandomlyAddBigramWords() {
315        final int wordCount = 100;
316        final int bigramCount = 1000;
317        final int codePointSetSize = 50;
318        final long seed = System.currentTimeMillis();
319        final Random random = new Random(seed);
320        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
321
322        final ArrayList<String> words = new ArrayList<>();
323        final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
324        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
325        final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
326        final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
327
328        for (int i = 0; i < wordCount; ++i) {
329            final String word = CodePointUtils.generateWord(random, codePointSet);
330            words.add(word);
331            final int unigramProbability = random.nextInt(0xFF);
332            unigramProbabilities.put(word, unigramProbability);
333            addUnigramWord(binaryDictionary, word, unigramProbability);
334        }
335
336        for (int i = 0; i < bigramCount; i++) {
337            final String word0 = words.get(random.nextInt(wordCount));
338            final String word1 = words.get(random.nextInt(wordCount));
339            if (TextUtils.equals(word0, word1)) {
340                continue;
341            }
342            final Pair<String, String> bigram = new Pair<>(word0, word1);
343            bigramWords.add(bigram);
344            final int unigramProbability = unigramProbabilities.get(word1);
345            final int bigramProbability =
346                    unigramProbability + random.nextInt(0xFF - unigramProbability);
347            bigramProbabilities.put(bigram, bigramProbability);
348            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
349        }
350
351        for (final Pair<String, String> bigram : bigramWords) {
352            final int bigramProbability = bigramProbabilities.get(bigram);
353            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
354                    isValidBigram(binaryDictionary, bigram.first, bigram.second));
355            assertEquals(bigramProbability,
356                    getBigramProbability(binaryDictionary, bigram.first, bigram.second));
357        }
358    }
359
360    public void testAddTrigramWords() {
361        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
362        final int unigramProbability = 100;
363        final int trigramProbability = 150;
364        final int updatedTrigramProbability = 200;
365        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
366        addUnigramWord(binaryDictionary, "abb", unigramProbability);
367        addUnigramWord(binaryDictionary, "bcc", unigramProbability);
368
369        addBigramWords(binaryDictionary, "abb", "bcc", 10);
370        addBigramWords(binaryDictionary, "abb", "aaa", 10);
371
372        addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability);
373        addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability);
374
375        assertEquals(trigramProbability,
376                getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc"));
377        assertEquals(trigramProbability,
378                getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
379        assertFalse(isValidBigram(binaryDictionary, "aaa", "abb"));
380
381        addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability);
382        assertEquals(updatedTrigramProbability,
383                getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
384    }
385
386    public void testFlushDictionary() {
387        final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
388        BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
389
390        final int probability = 100;
391        addUnigramWord(binaryDictionary, "aaa", probability);
392        addUnigramWord(binaryDictionary, "abcd", probability);
393        // Close without flushing.
394        binaryDictionary.close();
395
396        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
397                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
398                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
399
400        assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa"));
401        assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd"));
402
403        addUnigramWord(binaryDictionary, "aaa", probability);
404        addUnigramWord(binaryDictionary, "abcd", probability);
405        binaryDictionary.flush();
406        binaryDictionary.close();
407
408        binaryDictionary = getBinaryDictionary(dictFile);
409        assertEquals(probability, binaryDictionary.getFrequency("aaa"));
410        assertEquals(probability, binaryDictionary.getFrequency("abcd"));
411        addUnigramWord(binaryDictionary, "bcde", probability);
412        binaryDictionary.flush();
413        binaryDictionary.close();
414
415        binaryDictionary = getBinaryDictionary(dictFile);
416        assertEquals(probability, binaryDictionary.getFrequency("bcde"));
417        binaryDictionary.close();
418    }
419
420    public void testFlushWithGCDictionary() {
421        final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
422        BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
423        final int unigramProbability = 100;
424        final int bigramProbability = 150;
425        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
426        addUnigramWord(binaryDictionary, "abb", unigramProbability);
427        addUnigramWord(binaryDictionary, "bcc", unigramProbability);
428        addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
429        addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
430        addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
431        addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
432        binaryDictionary.flushWithGC();
433        binaryDictionary.close();
434
435        binaryDictionary = getBinaryDictionary(dictFile);
436        assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
437        assertEquals(unigramProbability, binaryDictionary.getFrequency("abb"));
438        assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc"));
439        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
440        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
441        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
442        assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
443        assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
444        assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
445        assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
446        binaryDictionary.flushWithGC();
447        binaryDictionary.close();
448    }
449
450    public void testAddBigramWordsAndFlashWithGC() {
451        final int wordCount = 100;
452        final int bigramCount = 1000;
453        final int codePointSetSize = 30;
454        final long seed = System.currentTimeMillis();
455        final Random random = new Random(seed);
456
457        final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
458        BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
459
460        final ArrayList<String> words = new ArrayList<>();
461        final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
462        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
463        final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
464        final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
465
466        for (int i = 0; i < wordCount; ++i) {
467            final String word = CodePointUtils.generateWord(random, codePointSet);
468            words.add(word);
469            final int unigramProbability = random.nextInt(0xFF);
470            unigramProbabilities.put(word, unigramProbability);
471            addUnigramWord(binaryDictionary, word, unigramProbability);
472        }
473
474        for (int i = 0; i < bigramCount; i++) {
475            final String word0 = words.get(random.nextInt(wordCount));
476            final String word1 = words.get(random.nextInt(wordCount));
477            if (TextUtils.equals(word0, word1)) {
478                continue;
479            }
480            final Pair<String, String> bigram = new Pair<>(word0, word1);
481            bigramWords.add(bigram);
482            final int unigramProbability = unigramProbabilities.get(word1);
483            final int bigramProbability =
484                    unigramProbability + random.nextInt(0xFF - unigramProbability);
485            bigramProbabilities.put(bigram, bigramProbability);
486            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
487        }
488
489        binaryDictionary.flushWithGC();
490        binaryDictionary.close();
491        binaryDictionary = getBinaryDictionary(dictFile);
492
493        for (final Pair<String, String> bigram : bigramWords) {
494            final int bigramProbability = bigramProbabilities.get(bigram);
495            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
496                    isValidBigram(binaryDictionary, bigram.first, bigram.second));
497            assertEquals(bigramProbability,
498                    getBigramProbability(binaryDictionary, bigram.first, bigram.second));
499        }
500    }
501
502    public void testRandomOperationsAndFlashWithGC() {
503        final int maxUnigramCount = 5000;
504        final int maxBigramCount = 10000;
505        final HashMap<String, String> attributeMap = new HashMap<>();
506        attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
507        attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
508
509        final int flashWithGCIterationCount = 50;
510        final int operationCountInEachIteration = 200;
511        final int initialUnigramCount = 100;
512        final float addUnigramProb = 0.5f;
513        final float addBigramProb = 0.8f;
514        final int codePointSetSize = 30;
515
516        final long seed = System.currentTimeMillis();
517        final Random random = new Random(seed);
518        final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
519                attributeMap);
520        BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
521
522        final ArrayList<String> words = new ArrayList<>();
523        final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
524        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
525        final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
526        final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
527        for (int i = 0; i < initialUnigramCount; ++i) {
528            final String word = CodePointUtils.generateWord(random, codePointSet);
529            words.add(word);
530            final int unigramProbability = random.nextInt(0xFF);
531            unigramProbabilities.put(word, unigramProbability);
532            addUnigramWord(binaryDictionary, word, unigramProbability);
533        }
534        binaryDictionary.flushWithGC();
535        binaryDictionary.close();
536
537        for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) {
538            binaryDictionary = getBinaryDictionary(dictFile);
539            for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) {
540                // Add unigram.
541                if (random.nextFloat() < addUnigramProb) {
542                    final String word = CodePointUtils.generateWord(random, codePointSet);
543                    words.add(word);
544                    final int unigramProbability = random.nextInt(0xFF);
545                    unigramProbabilities.put(word, unigramProbability);
546                    addUnigramWord(binaryDictionary, word, unigramProbability);
547                }
548                // Add bigram.
549                if (random.nextFloat() < addBigramProb && words.size() > 2) {
550                    final int word0Index = random.nextInt(words.size());
551                    int word1Index = random.nextInt(words.size() - 1);
552                    if (word0Index <= word1Index) {
553                        word1Index++;
554                    }
555                    final String word0 = words.get(word0Index);
556                    final String word1 = words.get(word1Index);
557                    if (TextUtils.equals(word0, word1)) {
558                        continue;
559                    }
560                    final int unigramProbability = unigramProbabilities.get(word1);
561                    final int bigramProbability =
562                            unigramProbability + random.nextInt(0xFF - unigramProbability);
563                    final Pair<String, String> bigram = new Pair<>(word0, word1);
564                    bigramWords.add(bigram);
565                    bigramProbabilities.put(bigram, bigramProbability);
566                    addBigramWords(binaryDictionary, word0, word1, bigramProbability);
567                }
568            }
569
570            // Test whether the all unigram operations are collectlly handled.
571            for (int i = 0; i < words.size(); i++) {
572                final String word = words.get(i);
573                final int unigramProbability = unigramProbabilities.get(word);
574                assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
575            }
576            // Test whether the all bigram operations are collectlly handled.
577            for (int i = 0; i < bigramWords.size(); i++) {
578                final Pair<String, String> bigram = bigramWords.get(i);
579                final int probability;
580                if (bigramProbabilities.containsKey(bigram)) {
581                    probability = bigramProbabilities.get(bigram);
582                } else {
583                    probability = Dictionary.NOT_A_PROBABILITY;
584                }
585
586                assertEquals(probability,
587                        getBigramProbability(binaryDictionary, bigram.first, bigram.second));
588                assertEquals(probability != Dictionary.NOT_A_PROBABILITY,
589                        isValidBigram(binaryDictionary, bigram.first, bigram.second));
590            }
591            binaryDictionary.flushWithGC();
592            binaryDictionary.close();
593        }
594    }
595
596    public void testAddManyUnigramsAndFlushWithGC() {
597        final int flashWithGCIterationCount = 3;
598        final int codePointSetSize = 50;
599
600        final long seed = System.currentTimeMillis();
601        final Random random = new Random(seed);
602
603        final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
604
605        final ArrayList<String> words = new ArrayList<>();
606        final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
607        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
608
609        BinaryDictionary binaryDictionary;
610        for (int i = 0; i < flashWithGCIterationCount; i++) {
611            binaryDictionary = getBinaryDictionary(dictFile);
612            while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
613                final String word = CodePointUtils.generateWord(random, codePointSet);
614                words.add(word);
615                final int unigramProbability = random.nextInt(0xFF);
616                unigramProbabilities.put(word, unigramProbability);
617                addUnigramWord(binaryDictionary, word, unigramProbability);
618            }
619
620            for (int j = 0; j < words.size(); j++) {
621                final String word = words.get(j);
622                final int unigramProbability = unigramProbabilities.get(word);
623                assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
624            }
625
626            binaryDictionary.flushWithGC();
627            binaryDictionary.close();
628        }
629    }
630
631    public void testUnigramAndBigramCount() {
632        final int maxUnigramCount = 5000;
633        final int maxBigramCount = 10000;
634        final HashMap<String, String> attributeMap = new HashMap<>();
635        attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
636        attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
637
638        final int flashWithGCIterationCount = 10;
639        final int codePointSetSize = 50;
640        final int unigramCountPerIteration = 1000;
641        final int bigramCountPerIteration = 2000;
642        final long seed = System.currentTimeMillis();
643        final Random random = new Random(seed);
644        final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
645                attributeMap);
646
647        final ArrayList<String> words = new ArrayList<>();
648        final HashSet<Pair<String, String>> bigrams = new HashSet<>();
649        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
650
651        BinaryDictionary binaryDictionary;
652        for (int i = 0; i < flashWithGCIterationCount; i++) {
653            binaryDictionary = getBinaryDictionary(dictFile);
654            for (int j = 0; j < unigramCountPerIteration; j++) {
655                final String word = CodePointUtils.generateWord(random, codePointSet);
656                words.add(word);
657                final int unigramProbability = random.nextInt(0xFF);
658                addUnigramWord(binaryDictionary, word, unigramProbability);
659            }
660            for (int j = 0; j < bigramCountPerIteration; j++) {
661                final String word0 = words.get(random.nextInt(words.size()));
662                final String word1 = words.get(random.nextInt(words.size()));
663                if (TextUtils.equals(word0, word1)) {
664                    continue;
665                }
666                bigrams.add(new Pair<>(word0, word1));
667                final int bigramProbability = random.nextInt(0xF);
668                addBigramWords(binaryDictionary, word0, word1, bigramProbability);
669            }
670            assertEquals(new HashSet<>(words).size(), Integer.parseInt(
671                    binaryDictionary.getPropertyForGettingStats(
672                            BinaryDictionary.UNIGRAM_COUNT_QUERY)));
673            assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
674                    binaryDictionary.getPropertyForGettingStats(
675                            BinaryDictionary.BIGRAM_COUNT_QUERY)));
676            binaryDictionary.flushWithGC();
677            assertEquals(new HashSet<>(words).size(), Integer.parseInt(
678                    binaryDictionary.getPropertyForGettingStats(
679                            BinaryDictionary.UNIGRAM_COUNT_QUERY)));
680            assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
681                    binaryDictionary.getPropertyForGettingStats(
682                            BinaryDictionary.BIGRAM_COUNT_QUERY)));
683            binaryDictionary.close();
684        }
685    }
686
687    public void testGetWordProperties() {
688        final long seed = System.currentTimeMillis();
689        final Random random = new Random(seed);
690        final int UNIGRAM_COUNT = 1000;
691        final int BIGRAM_COUNT = 1000;
692        final int codePointSetSize = 20;
693        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
694        final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
695        final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
696
697        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
698                false /* isBeginningOfSentence */);
699        assertFalse(invalidWordProperty.isValid());
700
701        final ArrayList<String> words = new ArrayList<>();
702        final HashMap<String, Integer> wordProbabilities = new HashMap<>();
703        final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
704        final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
705
706        for (int i = 0; i < UNIGRAM_COUNT; i++) {
707            final String word = CodePointUtils.generateWord(random, codePointSet);
708            final int unigramProbability = random.nextInt(0xFF);
709            final boolean isNotAWord = random.nextBoolean();
710            final boolean isPossiblyOffensive = random.nextBoolean();
711            // TODO: Add tests for historical info.
712            binaryDictionary.addUnigramEntry(word, unigramProbability,
713                    false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive,
714                    BinaryDictionary.NOT_A_VALID_TIMESTAMP);
715            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
716                binaryDictionary.flushWithGC();
717            }
718            words.add(word);
719            wordProbabilities.put(word, unigramProbability);
720            final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
721                    false /* isBeginningOfSentence */);
722            assertEquals(word, wordProperty.mWord);
723            assertTrue(wordProperty.isValid());
724            assertEquals(isNotAWord, wordProperty.mIsNotAWord);
725            assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive);
726            assertEquals(false, wordProperty.mHasNgrams);
727            assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
728        }
729
730        for (int i = 0; i < BIGRAM_COUNT; i++) {
731            final int word0Index = random.nextInt(wordProbabilities.size());
732            final int word1Index = random.nextInt(wordProbabilities.size());
733            if (word0Index == word1Index) {
734                continue;
735            }
736            final String word0 = words.get(word0Index);
737            final String word1 = words.get(word1Index);
738            final int unigramProbability = wordProbabilities.get(word1);
739            final int bigramProbability =
740                    unigramProbability + random.nextInt(0xFF - unigramProbability);
741            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
742            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
743                binaryDictionary.flushWithGC();
744            }
745            if (!bigrams.containsKey(word0)) {
746                final HashSet<String> bigramWord1s = new HashSet<>();
747                bigrams.put(word0, bigramWord1s);
748            }
749            bigrams.get(word0).add(word1);
750            bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability);
751        }
752
753        for (int i = 0; i < words.size(); i++) {
754            final String word0 = words.get(i);
755            if (!bigrams.containsKey(word0)) {
756                continue;
757            }
758            final HashSet<String> bigramWord1s = bigrams.get(word0);
759            final WordProperty wordProperty = binaryDictionary.getWordProperty(word0,
760                    false /* isBeginningOfSentence */);
761            assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size());
762            // TODO: Support ngram.
763            for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
764                final String word1 = bigramTarget.mWord;
765                assertTrue(bigramWord1s.contains(word1));
766                final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1));
767                assertEquals(bigramProbability, bigramTarget.getProbability());
768            }
769        }
770    }
771
772    public void testIterateAllWords() {
773        final long seed = System.currentTimeMillis();
774        final Random random = new Random(seed);
775        final int UNIGRAM_COUNT = 1000;
776        final int BIGRAM_COUNT = 1000;
777        final int codePointSetSize = 20;
778        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
779        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
780
781        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
782                false /* isBeginningOfSentence */);
783        assertFalse(invalidWordProperty.isValid());
784
785        final ArrayList<String> words = new ArrayList<>();
786        final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>();
787        final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
788        final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
789                new HashMap<>();
790
791        for (int i = 0; i < UNIGRAM_COUNT; i++) {
792            final String word = CodePointUtils.generateWord(random, codePointSet);
793            final int unigramProbability = random.nextInt(0xFF);
794            addUnigramWord(binaryDictionary, word, unigramProbability);
795            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
796                binaryDictionary.flushWithGC();
797            }
798            words.add(word);
799            wordProbabilitiesToCheckLater.put(word, unigramProbability);
800        }
801
802        for (int i = 0; i < BIGRAM_COUNT; i++) {
803            final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
804            final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
805            if (word0Index == word1Index) {
806                continue;
807            }
808            final String word0 = words.get(word0Index);
809            final String word1 = words.get(word1Index);
810            final int unigramProbability = wordProbabilitiesToCheckLater.get(word1);
811            final int bigramProbability =
812                    unigramProbability + random.nextInt(0xFF - unigramProbability);
813            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
814            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
815                binaryDictionary.flushWithGC();
816            }
817            if (!bigrams.containsKey(word0)) {
818                final HashSet<String> bigramWord1s = new HashSet<>();
819                bigrams.put(word0, bigramWord1s);
820            }
821            bigrams.get(word0).add(word1);
822            bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability);
823        }
824
825        final HashSet<String> wordSet = new HashSet<>(words);
826        final HashSet<Pair<String, String>> bigramSet =
827                new HashSet<>(bigramProbabilitiesToCheckLater.keySet());
828        int token = 0;
829        do {
830            final BinaryDictionary.GetNextWordPropertyResult result =
831                    binaryDictionary.getNextWordProperty(token);
832            final WordProperty wordProperty = result.mWordProperty;
833            final String word0 = wordProperty.mWord;
834            assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
835                    wordProperty.mProbabilityInfo.mProbability);
836            wordSet.remove(word0);
837            final HashSet<String> bigramWord1s = bigrams.get(word0);
838            // TODO: Support ngram.
839            if (wordProperty.mHasNgrams) {
840                for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
841                    final String word1 = bigramTarget.mWord;
842                    assertTrue(bigramWord1s.contains(word1));
843                    final Pair<String, String> bigram = new Pair<>(word0, word1);
844                    final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram);
845                    assertEquals(bigramProbability, bigramTarget.getProbability());
846                    bigramSet.remove(bigram);
847                }
848            }
849            token = result.mNextToken;
850        } while (token != 0);
851        assertTrue(wordSet.isEmpty());
852        assertTrue(bigramSet.isEmpty());
853    }
854
855    public void testPossiblyOffensiveAttributeMaintained() {
856        final BinaryDictionary binaryDictionary =
857                getEmptyBinaryDictionary(FormatSpec.VERSION403);
858        binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0);
859        WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false);
860        assertEquals(true, wordProperty.mIsPossiblyOffensive);
861    }
862
863    public void testBeginningOfSentence() {
864        final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
865        final int dummyProbability = 0;
866        final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
867        final int bigramProbability = 200;
868        addUnigramWord(binaryDictionary, "aaa", dummyProbability);
869        binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
870                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
871        assertEquals(bigramProbability,
872                binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
873        binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
874                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
875        addUnigramWord(binaryDictionary, "bbb", dummyProbability);
876        binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability,
877                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
878        binaryDictionary.flushWithGC();
879        assertEquals(bigramProbability,
880                binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
881        assertEquals(bigramProbability,
882                binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb"));
883    }
884}
885