BinaryDictionaryTests.java revision ff50b39176370ab80a33bfdcf9979603c08a88b3
1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import android.test.AndroidTestCase;
20import android.test.suitebuilder.annotation.LargeTest;
21import android.text.TextUtils;
22import android.util.Pair;
23
24import com.android.inputmethod.latin.makedict.CodePointUtils;
25import com.android.inputmethod.latin.makedict.FormatSpec;
26import com.android.inputmethod.latin.makedict.WeightedString;
27import com.android.inputmethod.latin.makedict.WordProperty;
28import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
29import com.android.inputmethod.latin.utils.FileUtils;
30import com.android.inputmethod.latin.utils.LanguageModelParam;
31
32import java.io.File;
33import java.io.IOException;
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.util.Locale;
38import java.util.Map;
39import java.util.Random;
40
41// TODO Use the seed passed as an argument for makedict test.
42@LargeTest
43public class BinaryDictionaryTests extends AndroidTestCase {
44    private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
45    private static final String TEST_LOCALE = "test";
46    private static final int[] DICT_FORMAT_VERSIONS =
47            new int[] { FormatSpec.VERSION4, FormatSpec.VERSION4_DEV };
48
49    private static boolean canCheckBigramProbability(final int formatVersion) {
50        return formatVersion >= FormatSpec.VERSION4_DEV;
51    }
52
53    private File createEmptyDictionaryAndGetFile(final String dictId,
54            final int formatVersion) throws IOException {
55        if (formatVersion == FormatSpec.VERSION4
56                || formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING
57                || formatVersion == FormatSpec.VERSION4_DEV) {
58            return createEmptyVer4DictionaryAndGetFile(dictId, formatVersion);
59        } else {
60            throw new IOException("Dictionary format version " + formatVersion
61                    + " is not supported.");
62        }
63    }
64
65    private File createEmptyVer4DictionaryAndGetFile(final String dictId,
66            final int formatVersion) throws IOException {
67        final File file = File.createTempFile(dictId, TEST_DICT_FILE_EXTENSION,
68                getContext().getCacheDir());
69        file.delete();
70        file.mkdir();
71        Map<String, String> attributeMap = new HashMap<String, String>();
72        if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion,
73                Locale.ENGLISH, attributeMap)) {
74            return file;
75        } else {
76            throw new IOException("Empty dictionary " + file.getAbsolutePath()
77                    + " cannot be created. Format version: " + formatVersion);
78        }
79    }
80
81    public void testIsValidDictionary() {
82        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
83            testIsValidDictionary(formatVersion);
84        }
85    }
86
87    private void testIsValidDictionary(final int formatVersion) {
88        File dictFile = null;
89        try {
90            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
91        } catch (IOException e) {
92            fail("IOException while writing an initial dictionary : " + e);
93        }
94        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
95                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
96                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
97        assertTrue("binaryDictionary must be valid for existing valid dictionary file.",
98                binaryDictionary.isValidDictionary());
99        binaryDictionary.close();
100        assertFalse("binaryDictionary must be invalid after closing.",
101                binaryDictionary.isValidDictionary());
102        FileUtils.deleteRecursively(dictFile);
103        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */,
104                dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(),
105                TEST_LOCALE, true /* isUpdatable */);
106        assertFalse("binaryDictionary must be invalid for not existing dictionary file.",
107                binaryDictionary.isValidDictionary());
108        binaryDictionary.close();
109    }
110
111    public void testConstructingDictionaryOnMemory() {
112        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
113            testConstructingDictionaryOnMemory(formatVersion);
114        }
115    }
116
117    private void testConstructingDictionaryOnMemory(final int formatVersion) {
118        File dictFile = null;
119        try {
120            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
121        } catch (IOException e) {
122            fail("IOException while writing an initial dictionary : " + e);
123        }
124        FileUtils.deleteRecursively(dictFile);
125        assertFalse(dictFile.exists());
126        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
127                true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, formatVersion,
128                new HashMap<String, String>());
129        assertTrue(binaryDictionary.isValidDictionary());
130        assertEquals(formatVersion, binaryDictionary.getFormatVersion());
131        final int probability = 100;
132        addUnigramWord(binaryDictionary, "word", probability);
133        assertEquals(probability, binaryDictionary.getFrequency("word"));
134        assertFalse(dictFile.exists());
135        binaryDictionary.flush();
136        assertTrue(dictFile.exists());
137        assertTrue(binaryDictionary.isValidDictionary());
138        assertEquals(formatVersion, binaryDictionary.getFormatVersion());
139        assertEquals(probability, binaryDictionary.getFrequency("word"));
140        binaryDictionary.close();
141        dictFile.delete();
142    }
143
144    public void testAddTooLongWord() {
145        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
146            testAddTooLongWord(formatVersion);
147        }
148    }
149
150    private void testAddTooLongWord(final int formatVersion) {
151        File dictFile = null;
152        try {
153            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
154        } catch (IOException e) {
155            fail("IOException while writing an initial dictionary : " + e);
156        }
157        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
158                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
159                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
160
161        final StringBuffer stringBuilder = new StringBuffer();
162        for (int i = 0; i < Constants.DICTIONARY_MAX_WORD_LENGTH; i++) {
163            stringBuilder.append('a');
164        }
165        final String validLongWord = stringBuilder.toString();
166        stringBuilder.append('a');
167        final String invalidLongWord = stringBuilder.toString();
168        final int probability = 100;
169        addUnigramWord(binaryDictionary, "aaa", probability);
170        addUnigramWord(binaryDictionary, validLongWord, probability);
171        addUnigramWord(binaryDictionary, invalidLongWord, probability);
172        // Too long short cut.
173        binaryDictionary.addUnigramEntry("a", probability, invalidLongWord,
174                10 /* shortcutProbability */, false /* isNotAWord */, false /* isBlacklisted */,
175                BinaryDictionary.NOT_A_VALID_TIMESTAMP);
176        addUnigramWord(binaryDictionary, "abc", probability);
177        final int updatedProbability = 200;
178        // Update.
179        addUnigramWord(binaryDictionary, validLongWord, updatedProbability);
180        addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability);
181        addUnigramWord(binaryDictionary, "abc", updatedProbability);
182
183        assertEquals(probability, binaryDictionary.getFrequency("aaa"));
184        assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord));
185        assertEquals(BinaryDictionary.NOT_A_PROBABILITY,
186                binaryDictionary.getFrequency(invalidLongWord));
187        assertEquals(updatedProbability, binaryDictionary.getFrequency("abc"));
188        dictFile.delete();
189    }
190
191    private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word,
192            final int probability) {
193        binaryDictionary.addUnigramEntry(word, probability, "" /* shortcutTarget */,
194                BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
195                false /* isNotAWord */, false /* isBlacklisted */,
196                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
197    }
198
199    private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
200            final String word1, final int probability) {
201        binaryDictionary.addNgramEntry(new PrevWordsInfo(word0), word1, probability,
202                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
203    }
204
205    private static boolean isValidBigram(final BinaryDictionary binaryDictionary,
206            final String word0, final String word1) {
207        return binaryDictionary.isValidNgram(new PrevWordsInfo(word0), word1);
208    }
209
210    private static void removeBigramEntry(final BinaryDictionary binaryDictionary,
211            final String word0, final String word1) {
212        binaryDictionary.removeNgramEntry(new PrevWordsInfo(word0), word1);
213    }
214
215    private static int getBigramProbability(final BinaryDictionary binaryDictionary,
216            final String word0,  final String word1) {
217        return binaryDictionary.getNgramProbability(new PrevWordsInfo(word0), word1);
218    }
219
220    public void testAddUnigramWord() {
221        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
222            testAddUnigramWord(formatVersion);
223        }
224    }
225
226    private void testAddUnigramWord(final int formatVersion) {
227        File dictFile = null;
228        try {
229            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
230        } catch (IOException e) {
231            fail("IOException while writing an initial dictionary : " + e);
232        }
233        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
234                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
235                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
236
237        final int probability = 100;
238        addUnigramWord(binaryDictionary, "aaa", probability);
239        // Reallocate and create.
240        addUnigramWord(binaryDictionary, "aab", probability);
241        // Insert into children.
242        addUnigramWord(binaryDictionary, "aac", probability);
243        // Make terminal.
244        addUnigramWord(binaryDictionary, "aa", probability);
245        // Create children.
246        addUnigramWord(binaryDictionary, "aaaa", probability);
247        // Reallocate and make termianl.
248        addUnigramWord(binaryDictionary, "a", probability);
249
250        final int updatedProbability = 200;
251        // Update.
252        addUnigramWord(binaryDictionary, "aaa", updatedProbability);
253
254        assertEquals(probability, binaryDictionary.getFrequency("aab"));
255        assertEquals(probability, binaryDictionary.getFrequency("aac"));
256        assertEquals(probability, binaryDictionary.getFrequency("aa"));
257        assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
258        assertEquals(probability, binaryDictionary.getFrequency("a"));
259        assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
260
261        dictFile.delete();
262    }
263
264    public void testRandomlyAddUnigramWord() {
265        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
266            testRandomlyAddUnigramWord(formatVersion);
267        }
268    }
269
270    private void testRandomlyAddUnigramWord(final int formatVersion) {
271        final int wordCount = 1000;
272        final int codePointSetSize = 50;
273        final long seed = System.currentTimeMillis();
274
275        File dictFile = null;
276        try {
277            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
278        } catch (IOException e) {
279            fail("IOException while writing an initial dictionary : " + e);
280        }
281        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
282                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
283                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
284
285        final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>();
286        // Test a word that isn't contained within the dictionary.
287        final Random random = new Random(seed);
288        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
289        for (int i = 0; i < wordCount; ++i) {
290            final String word = CodePointUtils.generateWord(random, codePointSet);
291            probabilityMap.put(word, random.nextInt(0xFF));
292        }
293        for (String word : probabilityMap.keySet()) {
294            addUnigramWord(binaryDictionary, word, probabilityMap.get(word));
295        }
296        for (String word : probabilityMap.keySet()) {
297            assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word));
298        }
299        dictFile.delete();
300    }
301
302    public void testAddBigramWords() {
303        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
304            testAddBigramWords(formatVersion);
305        }
306    }
307
308    private void testAddBigramWords(final int formatVersion) {
309        File dictFile = null;
310        try {
311            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
312        } catch (IOException e) {
313            fail("IOException while writing an initial dictionary : " + e);
314        }
315        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
316                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
317                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
318
319        final int unigramProbability = 100;
320        final int bigramProbability = 150;
321        final int updatedBigramProbability = 200;
322        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
323        addUnigramWord(binaryDictionary, "abb", unigramProbability);
324        addUnigramWord(binaryDictionary, "bcc", unigramProbability);
325        addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
326        addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
327        addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
328        addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
329
330        assertTrue(isValidBigram(binaryDictionary, "aaa", "abb"));
331        assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc"));
332        assertTrue(isValidBigram(binaryDictionary, "abb", "aaa"));
333        assertTrue(isValidBigram(binaryDictionary, "abb", "bcc"));
334        if (canCheckBigramProbability(formatVersion)) {
335            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
336            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
337            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
338            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
339        }
340
341        addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability);
342        if (canCheckBigramProbability(formatVersion)) {
343            assertEquals(updatedBigramProbability,
344                    getBigramProbability(binaryDictionary, "aaa", "abb"));
345        }
346
347        assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
348        assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
349        assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
350        assertEquals(Dictionary.NOT_A_PROBABILITY,
351                getBigramProbability(binaryDictionary, "bcc", "aaa"));
352        assertEquals(Dictionary.NOT_A_PROBABILITY,
353                getBigramProbability(binaryDictionary, "bcc", "bbc"));
354        assertEquals(Dictionary.NOT_A_PROBABILITY,
355                getBigramProbability(binaryDictionary, "aaa", "aaa"));
356
357        // Testing bigram link.
358        addUnigramWord(binaryDictionary, "abcde", unigramProbability);
359        addUnigramWord(binaryDictionary, "fghij", unigramProbability);
360        addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability);
361        addUnigramWord(binaryDictionary, "fgh", unigramProbability);
362        addUnigramWord(binaryDictionary, "abc", unigramProbability);
363        addUnigramWord(binaryDictionary, "f", unigramProbability);
364
365        if (canCheckBigramProbability(formatVersion)) {
366            assertEquals(bigramProbability,
367                    getBigramProbability(binaryDictionary, "abcde", "fghij"));
368        }
369        assertEquals(Dictionary.NOT_A_PROBABILITY,
370                getBigramProbability(binaryDictionary, "abcde", "fgh"));
371        addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability);
372        if (canCheckBigramProbability(formatVersion)) {
373            assertEquals(updatedBigramProbability,
374                    getBigramProbability(binaryDictionary, "abcde", "fghij"));
375        }
376
377        dictFile.delete();
378    }
379
380    public void testRandomlyAddBigramWords() {
381        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
382            testRandomlyAddBigramWords(formatVersion);
383        }
384    }
385
386    private void testRandomlyAddBigramWords(final int formatVersion) {
387        final int wordCount = 100;
388        final int bigramCount = 1000;
389        final int codePointSetSize = 50;
390        final long seed = System.currentTimeMillis();
391        final Random random = new Random(seed);
392
393        File dictFile = null;
394        try {
395            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
396        } catch (IOException e) {
397            fail("IOException while writing an initial dictionary : " + e);
398        }
399        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
400                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
401                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
402
403        final ArrayList<String> words = new ArrayList<String>();
404        final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>();
405        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
406        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
407        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
408                new HashMap<Pair<String, String>, Integer>();
409
410        for (int i = 0; i < wordCount; ++i) {
411            final String word = CodePointUtils.generateWord(random, codePointSet);
412            words.add(word);
413            final int unigramProbability = random.nextInt(0xFF);
414            unigramProbabilities.put(word, unigramProbability);
415            addUnigramWord(binaryDictionary, word, unigramProbability);
416        }
417
418        for (int i = 0; i < bigramCount; i++) {
419            final String word0 = words.get(random.nextInt(wordCount));
420            final String word1 = words.get(random.nextInt(wordCount));
421            if (TextUtils.equals(word0, word1)) {
422                continue;
423            }
424            final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
425            bigramWords.add(bigram);
426            final int unigramProbability = unigramProbabilities.get(word1);
427            final int bigramProbability =
428                    unigramProbability + random.nextInt(0xFF - unigramProbability);
429            bigramProbabilities.put(bigram, bigramProbability);
430            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
431        }
432
433        for (final Pair<String, String> bigram : bigramWords) {
434            final int bigramProbability = bigramProbabilities.get(bigram);
435            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
436                    isValidBigram(binaryDictionary, bigram.first, bigram.second));
437            if (canCheckBigramProbability(formatVersion)) {
438                assertEquals(bigramProbability,
439                        getBigramProbability(binaryDictionary, bigram.first, bigram.second));
440            }
441        }
442
443        dictFile.delete();
444    }
445
446    public void testRemoveBigramWords() {
447        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
448            testRemoveBigramWords(formatVersion);
449        }
450    }
451
452    private void testRemoveBigramWords(final int formatVersion) {
453        File dictFile = null;
454        try {
455            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
456        } catch (IOException e) {
457            fail("IOException while writing an initial dictionary : " + e);
458        }
459        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
460                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
461                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
462        final int unigramProbability = 100;
463        final int bigramProbability = 150;
464        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
465        addUnigramWord(binaryDictionary, "abb", unigramProbability);
466        addUnigramWord(binaryDictionary, "bcc", unigramProbability);
467        addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
468        addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
469        addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
470        addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
471
472        assertTrue(isValidBigram(binaryDictionary, "aaa", "abb"));
473        assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc"));
474        assertTrue(isValidBigram(binaryDictionary, "abb", "aaa"));
475        assertTrue(isValidBigram(binaryDictionary, "abb", "bcc"));
476
477        removeBigramEntry(binaryDictionary, "aaa", "abb");
478        assertFalse(isValidBigram(binaryDictionary, "aaa", "abb"));
479        addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
480        assertTrue(isValidBigram(binaryDictionary, "aaa", "abb"));
481
482
483        removeBigramEntry(binaryDictionary, "aaa", "bcc");
484        assertFalse(isValidBigram(binaryDictionary, "aaa", "bcc"));
485        removeBigramEntry(binaryDictionary, "abb", "aaa");
486        assertFalse(isValidBigram(binaryDictionary, "abb", "aaa"));
487        removeBigramEntry(binaryDictionary, "abb", "bcc");
488        assertFalse(isValidBigram(binaryDictionary, "abb", "bcc"));
489
490        removeBigramEntry(binaryDictionary, "aaa", "abb");
491        // Test remove non-existing bigram operation.
492        removeBigramEntry(binaryDictionary, "aaa", "abb");
493        removeBigramEntry(binaryDictionary, "bcc", "aaa");
494
495        dictFile.delete();
496    }
497
498    public void testFlushDictionary() {
499        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
500            testFlushDictionary(formatVersion);
501        }
502    }
503
504    private void testFlushDictionary(final int formatVersion) {
505        File dictFile = null;
506        try {
507            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
508        } catch (IOException e) {
509            fail("IOException while writing an initial dictionary : " + e);
510        }
511        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
512                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
513                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
514
515        final int probability = 100;
516        addUnigramWord(binaryDictionary, "aaa", probability);
517        addUnigramWord(binaryDictionary, "abcd", probability);
518        // Close without flushing.
519        binaryDictionary.close();
520
521        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
522                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
523                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
524
525        assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa"));
526        assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd"));
527
528        addUnigramWord(binaryDictionary, "aaa", probability);
529        addUnigramWord(binaryDictionary, "abcd", probability);
530        binaryDictionary.flush();
531        binaryDictionary.close();
532
533        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
534                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
535                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
536
537        assertEquals(probability, binaryDictionary.getFrequency("aaa"));
538        assertEquals(probability, binaryDictionary.getFrequency("abcd"));
539        addUnigramWord(binaryDictionary, "bcde", probability);
540        binaryDictionary.flush();
541        binaryDictionary.close();
542
543        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
544                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
545                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
546        assertEquals(probability, binaryDictionary.getFrequency("bcde"));
547        binaryDictionary.close();
548
549        dictFile.delete();
550    }
551
552    public void testFlushWithGCDictionary() {
553        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
554            testFlushWithGCDictionary(formatVersion);
555        }
556    }
557
558    private void testFlushWithGCDictionary(final int formatVersion) {
559        File dictFile = null;
560        try {
561            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
562        } catch (IOException e) {
563            fail("IOException while writing an initial dictionary : " + e);
564        }
565        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
566                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
567                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
568
569        final int unigramProbability = 100;
570        final int bigramProbability = 150;
571        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
572        addUnigramWord(binaryDictionary, "abb", unigramProbability);
573        addUnigramWord(binaryDictionary, "bcc", unigramProbability);
574        addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
575        addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
576        addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
577        addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
578        binaryDictionary.flushWithGC();
579        binaryDictionary.close();
580
581        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
582                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
583                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
584        assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
585        assertEquals(unigramProbability, binaryDictionary.getFrequency("abb"));
586        assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc"));
587        if (canCheckBigramProbability(formatVersion)) {
588            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
589            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
590            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
591            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
592        }
593        assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
594        assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
595        assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
596        binaryDictionary.flushWithGC();
597        binaryDictionary.close();
598
599        dictFile.delete();
600    }
601
602    public void testAddBigramWordsAndFlashWithGC() {
603        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
604            testAddBigramWordsAndFlashWithGC(formatVersion);
605        }
606    }
607
608    // TODO: Evaluate performance of GC
609    private void testAddBigramWordsAndFlashWithGC(final int formatVersion) {
610        final int wordCount = 100;
611        final int bigramCount = 1000;
612        final int codePointSetSize = 30;
613        final long seed = System.currentTimeMillis();
614        final Random random = new Random(seed);
615
616        File dictFile = null;
617        try {
618            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
619        } catch (IOException e) {
620            fail("IOException while writing an initial dictionary : " + e);
621        }
622
623        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
624                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
625                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
626
627        final ArrayList<String> words = new ArrayList<String>();
628        final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>();
629        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
630        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
631        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
632                new HashMap<Pair<String, String>, Integer>();
633
634        for (int i = 0; i < wordCount; ++i) {
635            final String word = CodePointUtils.generateWord(random, codePointSet);
636            words.add(word);
637            final int unigramProbability = random.nextInt(0xFF);
638            unigramProbabilities.put(word, unigramProbability);
639            addUnigramWord(binaryDictionary, word, unigramProbability);
640        }
641
642        for (int i = 0; i < bigramCount; i++) {
643            final String word0 = words.get(random.nextInt(wordCount));
644            final String word1 = words.get(random.nextInt(wordCount));
645            if (TextUtils.equals(word0, word1)) {
646                continue;
647            }
648            final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
649            bigramWords.add(bigram);
650            final int unigramProbability = unigramProbabilities.get(word1);
651            final int bigramProbability =
652                    unigramProbability + random.nextInt(0xFF - unigramProbability);
653            bigramProbabilities.put(bigram, bigramProbability);
654            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
655        }
656
657        binaryDictionary.flushWithGC();
658        binaryDictionary.close();
659        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
660                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
661                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
662
663
664        for (final Pair<String, String> bigram : bigramWords) {
665            final int bigramProbability = bigramProbabilities.get(bigram);
666            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
667                    isValidBigram(binaryDictionary, bigram.first, bigram.second));
668            if (canCheckBigramProbability(formatVersion)) {
669                assertEquals(bigramProbability,
670                        getBigramProbability(binaryDictionary, bigram.first, bigram.second));
671            }
672        }
673
674        dictFile.delete();
675    }
676
677    public void testRandomOperationsAndFlashWithGC() {
678        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
679            testRandomOperationsAndFlashWithGC(formatVersion);
680        }
681    }
682
683    private void testRandomOperationsAndFlashWithGC(final int formatVersion) {
684        final int flashWithGCIterationCount = 50;
685        final int operationCountInEachIteration = 200;
686        final int initialUnigramCount = 100;
687        final float addUnigramProb = 0.5f;
688        final float addBigramProb = 0.8f;
689        final float removeBigramProb = 0.2f;
690        final int codePointSetSize = 30;
691
692        final long seed = System.currentTimeMillis();
693        final Random random = new Random(seed);
694
695        File dictFile = null;
696        try {
697            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
698        } catch (IOException e) {
699            fail("IOException while writing an initial dictionary : " + e);
700        }
701
702        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
703                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
704                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
705        final ArrayList<String> words = new ArrayList<String>();
706        final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>();
707        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
708        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
709        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
710                new HashMap<Pair<String, String>, Integer>();
711        for (int i = 0; i < initialUnigramCount; ++i) {
712            final String word = CodePointUtils.generateWord(random, codePointSet);
713            words.add(word);
714            final int unigramProbability = random.nextInt(0xFF);
715            unigramProbabilities.put(word, unigramProbability);
716            addUnigramWord(binaryDictionary, word, unigramProbability);
717        }
718        binaryDictionary.flushWithGC();
719        binaryDictionary.close();
720
721        for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) {
722            binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
723                    0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
724                    Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
725            for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) {
726                // Add unigram.
727                if (random.nextFloat() < addUnigramProb) {
728                    final String word = CodePointUtils.generateWord(random, codePointSet);
729                    words.add(word);
730                    final int unigramProbability = random.nextInt(0xFF);
731                    unigramProbabilities.put(word, unigramProbability);
732                    addUnigramWord(binaryDictionary, word, unigramProbability);
733                }
734                // Add bigram.
735                if (random.nextFloat() < addBigramProb && words.size() > 2) {
736                    final int word0Index = random.nextInt(words.size());
737                    int word1Index = random.nextInt(words.size() - 1);
738                    if (word0Index <= word1Index) {
739                        word1Index++;
740                    }
741                    final String word0 = words.get(word0Index);
742                    final String word1 = words.get(word1Index);
743                    if (TextUtils.equals(word0, word1)) {
744                        continue;
745                    }
746                    final int unigramProbability = unigramProbabilities.get(word1);
747                    final int bigramProbability =
748                            unigramProbability + random.nextInt(0xFF - unigramProbability);
749                    final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
750                    bigramWords.add(bigram);
751                    bigramProbabilities.put(bigram, bigramProbability);
752                    addBigramWords(binaryDictionary, word0, word1, bigramProbability);
753                }
754                // Remove bigram.
755                if (random.nextFloat() < removeBigramProb && !bigramWords.isEmpty()) {
756                    final int bigramIndex = random.nextInt(bigramWords.size());
757                    final Pair<String, String> bigram = bigramWords.get(bigramIndex);
758                    bigramWords.remove(bigramIndex);
759                    bigramProbabilities.remove(bigram);
760                    removeBigramEntry(binaryDictionary, bigram.first, bigram.second);
761                }
762            }
763
764            // Test whether the all unigram operations are collectlly handled.
765            for (int i = 0; i < words.size(); i++) {
766                final String word = words.get(i);
767                final int unigramProbability = unigramProbabilities.get(word);
768                assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
769            }
770            // Test whether the all bigram operations are collectlly handled.
771            for (int i = 0; i < bigramWords.size(); i++) {
772                final Pair<String, String> bigram = bigramWords.get(i);
773                final int probability;
774                if (bigramProbabilities.containsKey(bigram)) {
775                    final int bigramProbability = bigramProbabilities.get(bigram);
776                    probability = bigramProbability;
777                } else {
778                    probability = Dictionary.NOT_A_PROBABILITY;
779                }
780
781                if (canCheckBigramProbability(formatVersion)) {
782                    assertEquals(probability,
783                            getBigramProbability(binaryDictionary, bigram.first, bigram.second));
784                }
785                assertEquals(probability != Dictionary.NOT_A_PROBABILITY,
786                        isValidBigram(binaryDictionary, bigram.first, bigram.second));
787            }
788            binaryDictionary.flushWithGC();
789            binaryDictionary.close();
790        }
791
792        dictFile.delete();
793    }
794
795    public void testAddManyUnigramsAndFlushWithGC() {
796        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
797            testAddManyUnigramsAndFlushWithGC(formatVersion);
798        }
799    }
800
801    private void testAddManyUnigramsAndFlushWithGC(final int formatVersion) {
802        final int flashWithGCIterationCount = 3;
803        final int codePointSetSize = 50;
804
805        final long seed = System.currentTimeMillis();
806        final Random random = new Random(seed);
807
808        File dictFile = null;
809        try {
810            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
811        } catch (IOException e) {
812            fail("IOException while writing an initial dictionary : " + e);
813        }
814
815        final ArrayList<String> words = new ArrayList<String>();
816        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
817        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
818
819        BinaryDictionary binaryDictionary;
820        for (int i = 0; i < flashWithGCIterationCount; i++) {
821            binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
822                    0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
823                    Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
824            while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
825                final String word = CodePointUtils.generateWord(random, codePointSet);
826                words.add(word);
827                final int unigramProbability = random.nextInt(0xFF);
828                unigramProbabilities.put(word, unigramProbability);
829                addUnigramWord(binaryDictionary, word, unigramProbability);
830            }
831
832            for (int j = 0; j < words.size(); j++) {
833                final String word = words.get(j);
834                final int unigramProbability = unigramProbabilities.get(word);
835                assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
836            }
837
838            binaryDictionary.flushWithGC();
839            binaryDictionary.close();
840        }
841
842        dictFile.delete();
843    }
844
845    public void testUnigramAndBigramCount() {
846        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
847            testUnigramAndBigramCount(formatVersion);
848        }
849    }
850
851    private void testUnigramAndBigramCount(final int formatVersion) {
852        final int flashWithGCIterationCount = 10;
853        final int codePointSetSize = 50;
854        final int unigramCountPerIteration = 1000;
855        final int bigramCountPerIteration = 2000;
856        final long seed = System.currentTimeMillis();
857        final Random random = new Random(seed);
858
859        File dictFile = null;
860        try {
861            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
862        } catch (IOException e) {
863            fail("IOException while writing an initial dictionary : " + e);
864        }
865
866        final ArrayList<String> words = new ArrayList<String>();
867        final HashSet<Pair<String, String>> bigrams = new HashSet<Pair<String, String>>();
868        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
869
870        BinaryDictionary binaryDictionary;
871        for (int i = 0; i < flashWithGCIterationCount; i++) {
872            binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
873                    0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
874                    Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
875            for (int j = 0; j < unigramCountPerIteration; j++) {
876                final String word = CodePointUtils.generateWord(random, codePointSet);
877                words.add(word);
878                final int unigramProbability = random.nextInt(0xFF);
879                addUnigramWord(binaryDictionary, word, unigramProbability);
880            }
881            for (int j = 0; j < bigramCountPerIteration; j++) {
882                final String word0 = words.get(random.nextInt(words.size()));
883                final String word1 = words.get(random.nextInt(words.size()));
884                if (TextUtils.equals(word0, word1)) {
885                    continue;
886                }
887                bigrams.add(new Pair<String, String>(word0, word1));
888                final int bigramProbability = random.nextInt(0xF);
889                addBigramWords(binaryDictionary, word0, word1, bigramProbability);
890            }
891            assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
892                    binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
893            assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
894                    binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));
895            binaryDictionary.flushWithGC();
896            assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
897                    binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
898            assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
899                    binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));
900            binaryDictionary.close();
901        }
902
903        dictFile.delete();
904    }
905
906    public void testAddMultipleDictionaryEntries() {
907        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
908            testAddMultipleDictionaryEntries(formatVersion);
909        }
910    }
911
912    private void testAddMultipleDictionaryEntries(final int formatVersion) {
913        final int codePointSetSize = 20;
914        final int lmParamCount = 1000;
915        final double bigramContinueRate = 0.9;
916        final long seed = System.currentTimeMillis();
917        final Random random = new Random(seed);
918
919        File dictFile = null;
920        try {
921            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
922        } catch (IOException e) {
923            fail("IOException while writing an initial dictionary : " + e);
924        }
925
926        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
927        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
928        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
929                new HashMap<Pair<String, String>, Integer>();
930
931        final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount];
932        String prevWord = null;
933        for (int i = 0; i < languageModelParams.length; i++) {
934            final String word = CodePointUtils.generateWord(random, codePointSet);
935            final int probability = random.nextInt(0xFF);
936            final int bigramProbability = probability + random.nextInt(0xFF - probability);
937            unigramProbabilities.put(word, probability);
938            if (prevWord == null) {
939                languageModelParams[i] = new LanguageModelParam(word, probability,
940                        BinaryDictionary.NOT_A_VALID_TIMESTAMP);
941            } else {
942                languageModelParams[i] = new LanguageModelParam(prevWord, word, probability,
943                        bigramProbability, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
944                bigramProbabilities.put(new Pair<String, String>(prevWord, word),
945                        bigramProbability);
946            }
947            prevWord = (random.nextDouble() < bigramContinueRate) ? word : null;
948        }
949
950        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
951                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
952                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
953        binaryDictionary.addMultipleDictionaryEntries(languageModelParams);
954
955        for (Map.Entry<String, Integer> entry : unigramProbabilities.entrySet()) {
956            assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey()));
957        }
958
959        for (Map.Entry<Pair<String, String>, Integer> entry : bigramProbabilities.entrySet()) {
960            final String word0 = entry.getKey().first;
961            final String word1 = entry.getKey().second;
962            final int bigramProbability = entry.getValue();
963            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
964                    isValidBigram(binaryDictionary, word0, word1));
965            if (canCheckBigramProbability(formatVersion)) {
966                assertEquals(bigramProbability,
967                        getBigramProbability(binaryDictionary, word0, word1));
968            }
969        }
970    }
971
972    public void testGetWordProperties() {
973        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
974            testGetWordProperties(formatVersion);
975        }
976    }
977
978    private void testGetWordProperties(final int formatVersion) {
979        final long seed = System.currentTimeMillis();
980        final Random random = new Random(seed);
981        final int UNIGRAM_COUNT = 1000;
982        final int BIGRAM_COUNT = 1000;
983        final int codePointSetSize = 20;
984        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
985
986        File dictFile = null;
987        try {
988            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
989        } catch (IOException e) {
990            fail("IOException while writing an initial dictionary : " + e);
991        }
992        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
993                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
994                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
995
996        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
997        assertFalse(invalidWordProperty.isValid());
998
999        final ArrayList<String> words = new ArrayList<String>();
1000        final HashMap<String, Integer> wordProbabilities = new HashMap<String, Integer>();
1001        final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
1002        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
1003                new HashMap<Pair<String, String>, Integer>();
1004
1005        for (int i = 0; i < UNIGRAM_COUNT; i++) {
1006            final String word = CodePointUtils.generateWord(random, codePointSet);
1007            final int unigramProbability = random.nextInt(0xFF);
1008            final boolean isNotAWord = random.nextBoolean();
1009            final boolean isBlacklisted = random.nextBoolean();
1010            // TODO: Add tests for historical info.
1011            binaryDictionary.addUnigramEntry(word, unigramProbability,
1012                    null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
1013                    isNotAWord, isBlacklisted, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
1014            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
1015                binaryDictionary.flushWithGC();
1016            }
1017            words.add(word);
1018            wordProbabilities.put(word, unigramProbability);
1019            final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
1020            assertEquals(word, wordProperty.mWord);
1021            assertTrue(wordProperty.isValid());
1022            assertEquals(isNotAWord, wordProperty.mIsNotAWord);
1023            assertEquals(isBlacklisted, wordProperty.mIsBlacklistEntry);
1024            assertEquals(false, wordProperty.mHasBigrams);
1025            assertEquals(false, wordProperty.mHasShortcuts);
1026            assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
1027            assertTrue(wordProperty.mShortcutTargets.isEmpty());
1028        }
1029
1030        for (int i = 0; i < BIGRAM_COUNT; i++) {
1031            final int word0Index = random.nextInt(wordProbabilities.size());
1032            final int word1Index = random.nextInt(wordProbabilities.size());
1033            if (word0Index == word1Index) {
1034                continue;
1035            }
1036            final String word0 = words.get(word0Index);
1037            final String word1 = words.get(word1Index);
1038            final int unigramProbability = wordProbabilities.get(word1);
1039            final int bigramProbability =
1040                    unigramProbability + random.nextInt(0xFF - unigramProbability);
1041            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
1042            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
1043                binaryDictionary.flushWithGC();
1044            }
1045            if (!bigrams.containsKey(word0)) {
1046                final HashSet<String> bigramWord1s = new HashSet<String>();
1047                bigrams.put(word0, bigramWord1s);
1048            }
1049            bigrams.get(word0).add(word1);
1050            bigramProbabilities.put(new Pair<String, String>(word0, word1), bigramProbability);
1051        }
1052
1053        for (int i = 0; i < words.size(); i++) {
1054            final String word0 = words.get(i);
1055            if (!bigrams.containsKey(word0)) {
1056                continue;
1057            }
1058            final HashSet<String> bigramWord1s = bigrams.get(word0);
1059            final WordProperty wordProperty = binaryDictionary.getWordProperty(word0);
1060            assertEquals(bigramWord1s.size(), wordProperty.mBigrams.size());
1061            for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
1062                final String word1 = wordProperty.mBigrams.get(j).mWord;
1063                assertTrue(bigramWord1s.contains(word1));
1064                if (canCheckBigramProbability(formatVersion)) {
1065                    final int bigramProbability = bigramProbabilities.get(
1066                            new Pair<String, String>(word0, word1));
1067                    assertEquals(bigramProbability, wordProperty.mBigrams.get(j).getProbability());
1068                }
1069            }
1070        }
1071    }
1072
1073    public void testIterateAllWords() {
1074        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
1075            testIterateAllWords(formatVersion);
1076        }
1077    }
1078
1079    private void testIterateAllWords(final int formatVersion) {
1080        final long seed = System.currentTimeMillis();
1081        final Random random = new Random(seed);
1082        final int UNIGRAM_COUNT = 1000;
1083        final int BIGRAM_COUNT = 1000;
1084        final int codePointSetSize = 20;
1085        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
1086
1087        File dictFile = null;
1088        try {
1089            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
1090        } catch (IOException e) {
1091            fail("IOException while writing an initial dictionary : " + e);
1092        }
1093        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
1094                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
1095                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
1096
1097        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
1098        assertFalse(invalidWordProperty.isValid());
1099
1100        final ArrayList<String> words = new ArrayList<String>();
1101        final HashMap<String, Integer> wordProbabilitiesToCheckLater =
1102                new HashMap<String, Integer>();
1103        final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
1104        final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
1105                new HashMap<Pair<String, String>, Integer>();
1106
1107        for (int i = 0; i < UNIGRAM_COUNT; i++) {
1108            final String word = CodePointUtils.generateWord(random, codePointSet);
1109            final int unigramProbability = random.nextInt(0xFF);
1110            addUnigramWord(binaryDictionary, word, unigramProbability);
1111            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
1112                binaryDictionary.flushWithGC();
1113            }
1114            words.add(word);
1115            wordProbabilitiesToCheckLater.put(word, unigramProbability);
1116        }
1117
1118        for (int i = 0; i < BIGRAM_COUNT; i++) {
1119            final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
1120            final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
1121            if (word0Index == word1Index) {
1122                continue;
1123            }
1124            final String word0 = words.get(word0Index);
1125            final String word1 = words.get(word1Index);
1126            final int unigramProbability = wordProbabilitiesToCheckLater.get(word1);
1127            final int bigramProbability =
1128                    unigramProbability + random.nextInt(0xFF - unigramProbability);
1129            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
1130            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
1131                binaryDictionary.flushWithGC();
1132            }
1133            if (!bigrams.containsKey(word0)) {
1134                final HashSet<String> bigramWord1s = new HashSet<String>();
1135                bigrams.put(word0, bigramWord1s);
1136            }
1137            bigrams.get(word0).add(word1);
1138            bigramProbabilitiesToCheckLater.put(
1139                    new Pair<String, String>(word0, word1), bigramProbability);
1140        }
1141
1142        final HashSet<String> wordSet = new HashSet<String>(words);
1143        final HashSet<Pair<String, String>> bigramSet =
1144                new HashSet<Pair<String,String>>(bigramProbabilitiesToCheckLater.keySet());
1145        int token = 0;
1146        do {
1147            final BinaryDictionary.GetNextWordPropertyResult result =
1148                    binaryDictionary.getNextWordProperty(token);
1149            final WordProperty wordProperty = result.mWordProperty;
1150            final String word0 = wordProperty.mWord;
1151            assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
1152                    wordProperty.mProbabilityInfo.mProbability);
1153            wordSet.remove(word0);
1154            final HashSet<String> bigramWord1s = bigrams.get(word0);
1155            for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
1156                final String word1 = wordProperty.mBigrams.get(j).mWord;
1157                assertTrue(bigramWord1s.contains(word1));
1158                final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
1159                if (canCheckBigramProbability(formatVersion)) {
1160                    final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram);
1161                    assertEquals(bigramProbability, wordProperty.mBigrams.get(j).getProbability());
1162                }
1163                bigramSet.remove(bigram);
1164            }
1165            token = result.mNextToken;
1166        } while (token != 0);
1167        assertTrue(wordSet.isEmpty());
1168        assertTrue(bigramSet.isEmpty());
1169    }
1170
1171    public void testAddShortcuts() {
1172        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
1173            testAddShortcuts(formatVersion);
1174        }
1175    }
1176
1177    private void testAddShortcuts(final int formatVersion) {
1178        File dictFile = null;
1179        try {
1180            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
1181        } catch (IOException e) {
1182            fail("IOException while writing an initial dictionary : " + e);
1183        }
1184        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
1185                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
1186                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
1187
1188        final int unigramProbability = 100;
1189        final int shortcutProbability = 10;
1190        binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
1191                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
1192                0 /* timestamp */);
1193        WordProperty wordProperty = binaryDictionary.getWordProperty("aaa");
1194        assertEquals(1, wordProperty.mShortcutTargets.size());
1195        assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
1196        assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability());
1197        final int updatedShortcutProbability = 2;
1198        binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
1199                updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
1200                0 /* timestamp */);
1201        wordProperty = binaryDictionary.getWordProperty("aaa");
1202        assertEquals(1, wordProperty.mShortcutTargets.size());
1203        assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
1204        assertEquals(updatedShortcutProbability,
1205                wordProperty.mShortcutTargets.get(0).getProbability());
1206        binaryDictionary.addUnigramEntry("aaa", unigramProbability, "yyy",
1207                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
1208                0 /* timestamp */);
1209        final HashMap<String, Integer> shortcutTargets = new HashMap<String, Integer>();
1210        shortcutTargets.put("zzz", updatedShortcutProbability);
1211        shortcutTargets.put("yyy", shortcutProbability);
1212        wordProperty = binaryDictionary.getWordProperty("aaa");
1213        assertEquals(2, wordProperty.mShortcutTargets.size());
1214        for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
1215            assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
1216            assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
1217                    shortcutTarget.getProbability());
1218            shortcutTargets.remove(shortcutTarget.mWord);
1219        }
1220        shortcutTargets.put("zzz", updatedShortcutProbability);
1221        shortcutTargets.put("yyy", shortcutProbability);
1222        binaryDictionary.flushWithGC();
1223        wordProperty = binaryDictionary.getWordProperty("aaa");
1224        assertEquals(2, wordProperty.mShortcutTargets.size());
1225        for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
1226            assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
1227            assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
1228                    shortcutTarget.getProbability());
1229            shortcutTargets.remove(shortcutTarget.mWord);
1230        }
1231    }
1232
1233    public void testAddManyShortcuts() {
1234        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
1235            testAddManyShortcuts(formatVersion);
1236        }
1237    }
1238
1239    private void testAddManyShortcuts(final int formatVersion) {
1240        final long seed = System.currentTimeMillis();
1241        final Random random = new Random(seed);
1242        final int UNIGRAM_COUNT = 1000;
1243        final int SHORTCUT_COUNT = 10000;
1244        final int codePointSetSize = 20;
1245        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
1246
1247        final ArrayList<String> words = new ArrayList<String>();
1248        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
1249        final HashMap<String, HashMap<String, Integer>> shortcutTargets =
1250                new HashMap<String, HashMap<String, Integer>>();
1251
1252        File dictFile = null;
1253        try {
1254            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
1255        } catch (IOException e) {
1256            fail("IOException while writing an initial dictionary : " + e);
1257        }
1258        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
1259                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
1260                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
1261
1262        for (int i = 0; i < UNIGRAM_COUNT; i++) {
1263            final String word = CodePointUtils.generateWord(random, codePointSet);
1264            final int unigramProbability = random.nextInt(0xFF);
1265            addUnigramWord(binaryDictionary, word, unigramProbability);
1266            words.add(word);
1267            unigramProbabilities.put(word, unigramProbability);
1268            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
1269                binaryDictionary.flushWithGC();
1270            }
1271        }
1272        for (int i = 0; i < SHORTCUT_COUNT; i++) {
1273            final String shortcutTarget = CodePointUtils.generateWord(random, codePointSet);
1274            final int shortcutProbability = random.nextInt(0xF);
1275            final String word = words.get(random.nextInt(words.size()));
1276            final int unigramProbability = unigramProbabilities.get(word);
1277            binaryDictionary.addUnigramEntry(word, unigramProbability, shortcutTarget,
1278                    shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
1279                    0 /* timestamp */);
1280            if (shortcutTargets.containsKey(word)) {
1281                final HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
1282                shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
1283            } else {
1284                final HashMap<String, Integer> shortcutTargetsOfWord =
1285                        new HashMap<String, Integer>();
1286                shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
1287                shortcutTargets.put(word, shortcutTargetsOfWord);
1288            }
1289            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
1290                binaryDictionary.flushWithGC();
1291            }
1292        }
1293
1294        for (final String word : words) {
1295            final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
1296            assertEquals((int)unigramProbabilities.get(word),
1297                    wordProperty.mProbabilityInfo.mProbability);
1298            if (!shortcutTargets.containsKey(word)) {
1299                // The word does not have shortcut targets.
1300                continue;
1301            }
1302            assertEquals(shortcutTargets.get(word).size(), wordProperty.mShortcutTargets.size());
1303            for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
1304                final String targetCodePonts = shortcutTarget.mWord;
1305                assertEquals((int)shortcutTargets.get(word).get(targetCodePonts),
1306                        shortcutTarget.getProbability());
1307            }
1308        }
1309    }
1310
1311    public void testDictMigration() {
1312        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
1313            testDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
1314        }
1315    }
1316
1317    private void testDictMigration(final int fromFormatVersion, final int toFormatVersion) {
1318        File dictFile = null;
1319        try {
1320            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", fromFormatVersion);
1321        } catch (IOException e) {
1322            fail("IOException while writing an initial dictionary : " + e);
1323        }
1324        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
1325                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
1326                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
1327        final int unigramProbability = 100;
1328        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
1329        addUnigramWord(binaryDictionary, "bbb", unigramProbability);
1330        final int bigramProbability = 150;
1331        addBigramWords(binaryDictionary, "aaa", "bbb", bigramProbability);
1332        final int shortcutProbability = 10;
1333        binaryDictionary.addUnigramEntry("ccc", unigramProbability, "xxx", shortcutProbability,
1334                false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
1335        binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
1336                Dictionary.NOT_A_PROBABILITY, true /* isNotAWord */,
1337                true /* isBlacklisted */, 0 /* timestamp */);
1338        assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
1339        assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
1340        assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
1341        assertEquals(fromFormatVersion, binaryDictionary.getFormatVersion());
1342        assertTrue(binaryDictionary.migrateTo(toFormatVersion));
1343        assertTrue(binaryDictionary.isValidDictionary());
1344        assertEquals(toFormatVersion, binaryDictionary.getFormatVersion());
1345        assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
1346        assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
1347        if (canCheckBigramProbability(toFormatVersion)) {
1348            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bbb"));
1349        }
1350        assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
1351        WordProperty wordProperty = binaryDictionary.getWordProperty("ccc");
1352        assertEquals(1, wordProperty.mShortcutTargets.size());
1353        assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
1354        wordProperty = binaryDictionary.getWordProperty("ddd");
1355        assertTrue(wordProperty.mIsBlacklistEntry);
1356        assertTrue(wordProperty.mIsNotAWord);
1357    }
1358
1359    public void testLargeDictMigration() {
1360        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
1361            testLargeDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
1362        }
1363    }
1364
1365    private void testLargeDictMigration(final int fromFormatVersion, final int toFormatVersion) {
1366        final int UNIGRAM_COUNT = 3000;
1367        final int BIGRAM_COUNT = 3000;
1368        final int codePointSetSize = 50;
1369        final long seed = System.currentTimeMillis();
1370        final Random random = new Random(seed);
1371
1372        File dictFile = null;
1373        try {
1374            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", fromFormatVersion);
1375        } catch (IOException e) {
1376            fail("IOException while writing an initial dictionary : " + e);
1377        }
1378        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
1379                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
1380                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
1381
1382        final ArrayList<String> words = new ArrayList<String>();
1383        final ArrayList<Pair<String, String>> bigrams = new ArrayList<Pair<String,String>>();
1384        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
1385        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
1386        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
1387                new HashMap<Pair<String, String>, Integer>();
1388
1389        for (int i = 0; i < UNIGRAM_COUNT; i++) {
1390            final String word = CodePointUtils.generateWord(random, codePointSet);
1391            final int unigramProbability = random.nextInt(0xFF);
1392            addUnigramWord(binaryDictionary, word, unigramProbability);
1393            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
1394                binaryDictionary.flushWithGC();
1395            }
1396            words.add(word);
1397            unigramProbabilities.put(word, unigramProbability);
1398        }
1399
1400        for (int i = 0; i < BIGRAM_COUNT; i++) {
1401            final int word0Index = random.nextInt(words.size());
1402            final int word1Index = random.nextInt(words.size());
1403            if (word0Index == word1Index) {
1404                continue;
1405            }
1406            final String word0 = words.get(word0Index);
1407            final String word1 = words.get(word1Index);
1408            final int unigramProbability = unigramProbabilities.get(word1);
1409            final int bigramProbability =
1410                    random.nextInt(0xFF - unigramProbability) + unigramProbability;
1411            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
1412            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
1413                binaryDictionary.flushWithGC();
1414            }
1415            final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
1416            bigrams.add(bigram);
1417            bigramProbabilities.put(bigram, bigramProbability);
1418        }
1419        assertTrue(binaryDictionary.migrateTo(toFormatVersion));
1420
1421        for (final String word : words) {
1422            assertEquals((int)unigramProbabilities.get(word), binaryDictionary.getFrequency(word));
1423        }
1424        assertEquals(unigramProbabilities.size(), Integer.parseInt(
1425                binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
1426
1427        for (final Pair<String, String> bigram : bigrams) {
1428            if (canCheckBigramProbability(toFormatVersion)) {
1429                assertEquals((int)bigramProbabilities.get(bigram),
1430                        getBigramProbability(binaryDictionary, bigram.first, bigram.second));
1431            }
1432            assertTrue(isValidBigram(binaryDictionary, bigram.first, bigram.second));
1433        }
1434        assertEquals(bigramProbabilities.size(), Integer.parseInt(
1435                binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));
1436    }
1437}
1438