BinaryDictionaryTests.java revision ff50b39176370ab80a33bfdcf9979603c08a88b3
1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.test.AndroidTestCase; 20import android.test.suitebuilder.annotation.LargeTest; 21import android.text.TextUtils; 22import android.util.Pair; 23 24import com.android.inputmethod.latin.makedict.CodePointUtils; 25import com.android.inputmethod.latin.makedict.FormatSpec; 26import com.android.inputmethod.latin.makedict.WeightedString; 27import com.android.inputmethod.latin.makedict.WordProperty; 28import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; 29import com.android.inputmethod.latin.utils.FileUtils; 30import com.android.inputmethod.latin.utils.LanguageModelParam; 31 32import java.io.File; 33import java.io.IOException; 34import java.util.ArrayList; 35import java.util.HashMap; 36import java.util.HashSet; 37import java.util.Locale; 38import java.util.Map; 39import java.util.Random; 40 41// TODO Use the seed passed as an argument for makedict test. 
42@LargeTest 43public class BinaryDictionaryTests extends AndroidTestCase { 44 private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; 45 private static final String TEST_LOCALE = "test"; 46 private static final int[] DICT_FORMAT_VERSIONS = 47 new int[] { FormatSpec.VERSION4, FormatSpec.VERSION4_DEV }; 48 49 private static boolean canCheckBigramProbability(final int formatVersion) { 50 return formatVersion >= FormatSpec.VERSION4_DEV; 51 } 52 53 private File createEmptyDictionaryAndGetFile(final String dictId, 54 final int formatVersion) throws IOException { 55 if (formatVersion == FormatSpec.VERSION4 56 || formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING 57 || formatVersion == FormatSpec.VERSION4_DEV) { 58 return createEmptyVer4DictionaryAndGetFile(dictId, formatVersion); 59 } else { 60 throw new IOException("Dictionary format version " + formatVersion 61 + " is not supported."); 62 } 63 } 64 65 private File createEmptyVer4DictionaryAndGetFile(final String dictId, 66 final int formatVersion) throws IOException { 67 final File file = File.createTempFile(dictId, TEST_DICT_FILE_EXTENSION, 68 getContext().getCacheDir()); 69 file.delete(); 70 file.mkdir(); 71 Map<String, String> attributeMap = new HashMap<String, String>(); 72 if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion, 73 Locale.ENGLISH, attributeMap)) { 74 return file; 75 } else { 76 throw new IOException("Empty dictionary " + file.getAbsolutePath() 77 + " cannot be created. 
Format version: " + formatVersion); 78 } 79 } 80 81 public void testIsValidDictionary() { 82 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 83 testIsValidDictionary(formatVersion); 84 } 85 } 86 87 private void testIsValidDictionary(final int formatVersion) { 88 File dictFile = null; 89 try { 90 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 91 } catch (IOException e) { 92 fail("IOException while writing an initial dictionary : " + e); 93 } 94 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 95 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 96 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 97 assertTrue("binaryDictionary must be valid for existing valid dictionary file.", 98 binaryDictionary.isValidDictionary()); 99 binaryDictionary.close(); 100 assertFalse("binaryDictionary must be invalid after closing.", 101 binaryDictionary.isValidDictionary()); 102 FileUtils.deleteRecursively(dictFile); 103 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, 104 dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), 105 TEST_LOCALE, true /* isUpdatable */); 106 assertFalse("binaryDictionary must be invalid for not existing dictionary file.", 107 binaryDictionary.isValidDictionary()); 108 binaryDictionary.close(); 109 } 110 111 public void testConstructingDictionaryOnMemory() { 112 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 113 testConstructingDictionaryOnMemory(formatVersion); 114 } 115 } 116 117 private void testConstructingDictionaryOnMemory(final int formatVersion) { 118 File dictFile = null; 119 try { 120 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 121 } catch (IOException e) { 122 fail("IOException while writing an initial dictionary : " + e); 123 } 124 FileUtils.deleteRecursively(dictFile); 125 assertFalse(dictFile.exists()); 126 BinaryDictionary binaryDictionary 
= new BinaryDictionary(dictFile.getAbsolutePath(), 127 true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, formatVersion, 128 new HashMap<String, String>()); 129 assertTrue(binaryDictionary.isValidDictionary()); 130 assertEquals(formatVersion, binaryDictionary.getFormatVersion()); 131 final int probability = 100; 132 addUnigramWord(binaryDictionary, "word", probability); 133 assertEquals(probability, binaryDictionary.getFrequency("word")); 134 assertFalse(dictFile.exists()); 135 binaryDictionary.flush(); 136 assertTrue(dictFile.exists()); 137 assertTrue(binaryDictionary.isValidDictionary()); 138 assertEquals(formatVersion, binaryDictionary.getFormatVersion()); 139 assertEquals(probability, binaryDictionary.getFrequency("word")); 140 binaryDictionary.close(); 141 dictFile.delete(); 142 } 143 144 public void testAddTooLongWord() { 145 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 146 testAddTooLongWord(formatVersion); 147 } 148 } 149 150 private void testAddTooLongWord(final int formatVersion) { 151 File dictFile = null; 152 try { 153 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 154 } catch (IOException e) { 155 fail("IOException while writing an initial dictionary : " + e); 156 } 157 final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 158 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 159 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 160 161 final StringBuffer stringBuilder = new StringBuffer(); 162 for (int i = 0; i < Constants.DICTIONARY_MAX_WORD_LENGTH; i++) { 163 stringBuilder.append('a'); 164 } 165 final String validLongWord = stringBuilder.toString(); 166 stringBuilder.append('a'); 167 final String invalidLongWord = stringBuilder.toString(); 168 final int probability = 100; 169 addUnigramWord(binaryDictionary, "aaa", probability); 170 addUnigramWord(binaryDictionary, validLongWord, probability); 171 
addUnigramWord(binaryDictionary, invalidLongWord, probability); 172 // Shortcut target that is too long. 173 binaryDictionary.addUnigramEntry("a", probability, invalidLongWord, 174 10 /* shortcutProbability */, false /* isNotAWord */, false /* isBlacklisted */, 175 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 176 addUnigramWord(binaryDictionary, "abc", probability); 177 final int updatedProbability = 200; 178 // Re-add the same words with a new probability. 179 addUnigramWord(binaryDictionary, validLongWord, updatedProbability); 180 addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability); 181 addUnigramWord(binaryDictionary, "abc", updatedProbability); 182 183 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 184 assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord)); 185 assertEquals(BinaryDictionary.NOT_A_PROBABILITY, 186 binaryDictionary.getFrequency(invalidLongWord)); 187 assertEquals(updatedProbability, binaryDictionary.getFrequency("abc")); 188 dictFile.delete(); 189 } 190 191 private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, 192 final int probability) { 193 binaryDictionary.addUnigramEntry(word, probability, "" /* shortcutTarget */, 194 BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */, 195 false /* isNotAWord */, false /* isBlacklisted */, 196 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 197 } 198 199 private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, 200 final String word1, final int probability) { 201 binaryDictionary.addNgramEntry(new PrevWordsInfo(word0), word1, probability, 202 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 203 } 204 205 private static boolean isValidBigram(final BinaryDictionary binaryDictionary, 206 final String word0, final String word1) { 207 return binaryDictionary.isValidNgram(new PrevWordsInfo(word0), word1); 208 } 209 210 private static void removeBigramEntry(final BinaryDictionary binaryDictionary, 211 final 
String word0, final String word1) { 212 binaryDictionary.removeNgramEntry(new PrevWordsInfo(word0), word1); 213 } 214 215 private static int getBigramProbability(final BinaryDictionary binaryDictionary, 216 final String word0, final String word1) { 217 return binaryDictionary.getNgramProbability(new PrevWordsInfo(word0), word1); 218 } 219 220 public void testAddUnigramWord() { 221 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 222 testAddUnigramWord(formatVersion); 223 } 224 } 225 226 private void testAddUnigramWord(final int formatVersion) { 227 File dictFile = null; 228 try { 229 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 230 } catch (IOException e) { 231 fail("IOException while writing an initial dictionary : " + e); 232 } 233 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 234 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 235 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 236 237 final int probability = 100; 238 addUnigramWord(binaryDictionary, "aaa", probability); 239 // Reallocate and create. 240 addUnigramWord(binaryDictionary, "aab", probability); 241 // Insert into children. 242 addUnigramWord(binaryDictionary, "aac", probability); 243 // Make terminal. 244 addUnigramWord(binaryDictionary, "aa", probability); 245 // Create children. 246 addUnigramWord(binaryDictionary, "aaaa", probability); 247 // Reallocate and make terminal. 248 addUnigramWord(binaryDictionary, "a", probability); 249 250 final int updatedProbability = 200; 251 // Update. 
252 addUnigramWord(binaryDictionary, "aaa", updatedProbability); 253 254 assertEquals(probability, binaryDictionary.getFrequency("aab")); 255 assertEquals(probability, binaryDictionary.getFrequency("aac")); 256 assertEquals(probability, binaryDictionary.getFrequency("aa")); 257 assertEquals(probability, binaryDictionary.getFrequency("aaaa")); 258 assertEquals(probability, binaryDictionary.getFrequency("a")); 259 assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); 260 261 dictFile.delete(); /* NOTE(review): dictFile is created with mkdir() in createEmptyVer4DictionaryAndGetFile, and File.delete() does not remove a non-empty directory; testIsValidDictionary uses FileUtils.deleteRecursively(dictFile) instead. Presumably this leaves the directory behind - confirm. binaryDictionary is also never closed in this test - confirm whether that is intended. */ 262 } 263 264 public void testRandomlyAddUnigramWord() { 265 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 266 testRandomlyAddUnigramWord(formatVersion); 267 } 268 } 269 270 private void testRandomlyAddUnigramWord(final int formatVersion) { 271 final int wordCount = 1000; 272 final int codePointSetSize = 50; 273 final long seed = System.currentTimeMillis(); /* NOTE(review): the seed is only passed to Random and never reported, so a failing run cannot be replayed. */ 274 275 File dictFile = null; 276 try { 277 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 278 } catch (IOException e) { 279 fail("IOException while writing an initial dictionary : " + e); 280 } 281 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 282 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 283 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 284 285 final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>(); 286 // Add randomly generated words with random probabilities, then verify each lookup. 
287 final Random random = new Random(seed); 288 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 289 for (int i = 0; i < wordCount; ++i) { 290 final String word = CodePointUtils.generateWord(random, codePointSet); 291 probabilityMap.put(word, random.nextInt(0xFF)); 292 } 293 for (String word : probabilityMap.keySet()) { 294 addUnigramWord(binaryDictionary, word, probabilityMap.get(word)); 295 } 296 for (String word : probabilityMap.keySet()) { 297 assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); 298 } 299 dictFile.delete(); 300 } 301 302 public void testAddBigramWords() { 303 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 304 testAddBigramWords(formatVersion); 305 } 306 } 307 308 private void testAddBigramWords(final int formatVersion) { 309 File dictFile = null; 310 try { 311 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 312 } catch (IOException e) { 313 fail("IOException while writing an initial dictionary : " + e); 314 } 315 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 316 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 317 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 318 319 final int unigramProbability = 100; 320 final int bigramProbability = 150; 321 final int updatedBigramProbability = 200; 322 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 323 addUnigramWord(binaryDictionary, "abb", unigramProbability); 324 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 325 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 326 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 327 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 328 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 329 330 assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); 331 
assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc")); 332 assertTrue(isValidBigram(binaryDictionary, "abb", "aaa")); 333 assertTrue(isValidBigram(binaryDictionary, "abb", "bcc")); 334 if (canCheckBigramProbability(formatVersion)) { 335 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 336 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 337 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 338 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 339 } 340 341 addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability); 342 if (canCheckBigramProbability(formatVersion)) { 343 assertEquals(updatedBigramProbability, 344 getBigramProbability(binaryDictionary, "aaa", "abb")); 345 } 346 347 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 348 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 349 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 350 assertEquals(Dictionary.NOT_A_PROBABILITY, 351 getBigramProbability(binaryDictionary, "bcc", "aaa")); 352 assertEquals(Dictionary.NOT_A_PROBABILITY, 353 getBigramProbability(binaryDictionary, "bcc", "bbc")); 354 assertEquals(Dictionary.NOT_A_PROBABILITY, 355 getBigramProbability(binaryDictionary, "aaa", "aaa")); 356 357 // Testing bigram link. 
358 addUnigramWord(binaryDictionary, "abcde", unigramProbability); 359 addUnigramWord(binaryDictionary, "fghij", unigramProbability); 360 addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability); 361 addUnigramWord(binaryDictionary, "fgh", unigramProbability); 362 addUnigramWord(binaryDictionary, "abc", unigramProbability); 363 addUnigramWord(binaryDictionary, "f", unigramProbability); 364 365 if (canCheckBigramProbability(formatVersion)) { 366 assertEquals(bigramProbability, 367 getBigramProbability(binaryDictionary, "abcde", "fghij")); 368 } 369 assertEquals(Dictionary.NOT_A_PROBABILITY, 370 getBigramProbability(binaryDictionary, "abcde", "fgh")); 371 addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability); 372 if (canCheckBigramProbability(formatVersion)) { 373 assertEquals(updatedBigramProbability, 374 getBigramProbability(binaryDictionary, "abcde", "fghij")); 375 } 376 377 dictFile.delete(); 378 } 379 380 public void testRandomlyAddBigramWords() { 381 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 382 testRandomlyAddBigramWords(formatVersion); 383 } 384 } 385 386 private void testRandomlyAddBigramWords(final int formatVersion) { 387 final int wordCount = 100; 388 final int bigramCount = 1000; 389 final int codePointSetSize = 50; 390 final long seed = System.currentTimeMillis(); 391 final Random random = new Random(seed); 392 393 File dictFile = null; 394 try { 395 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 396 } catch (IOException e) { 397 fail("IOException while writing an initial dictionary : " + e); 398 } 399 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 400 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 401 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 402 403 final ArrayList<String> words = new ArrayList<String>(); 404 final ArrayList<Pair<String, String>> bigramWords = new 
ArrayList<Pair<String,String>>(); 405 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 406 final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>(); 407 final HashMap<Pair<String, String>, Integer> bigramProbabilities = 408 new HashMap<Pair<String, String>, Integer>(); 409 410 for (int i = 0; i < wordCount; ++i) { 411 final String word = CodePointUtils.generateWord(random, codePointSet); 412 words.add(word); 413 final int unigramProbability = random.nextInt(0xFF); 414 unigramProbabilities.put(word, unigramProbability); 415 addUnigramWord(binaryDictionary, word, unigramProbability); 416 } 417 418 for (int i = 0; i < bigramCount; i++) { 419 final String word0 = words.get(random.nextInt(wordCount)); 420 final String word1 = words.get(random.nextInt(wordCount)); 421 if (TextUtils.equals(word0, word1)) { 422 continue; 423 } 424 final Pair<String, String> bigram = new Pair<String, String>(word0, word1); 425 bigramWords.add(bigram); 426 final int unigramProbability = unigramProbabilities.get(word1); 427 final int bigramProbability = 428 unigramProbability + random.nextInt(0xFF - unigramProbability); 429 bigramProbabilities.put(bigram, bigramProbability); 430 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 431 } 432 433 for (final Pair<String, String> bigram : bigramWords) { 434 final int bigramProbability = bigramProbabilities.get(bigram); 435 assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, 436 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 437 if (canCheckBigramProbability(formatVersion)) { 438 assertEquals(bigramProbability, 439 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 440 } 441 } 442 443 dictFile.delete(); 444 } 445 446 public void testRemoveBigramWords() { 447 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 448 testRemoveBigramWords(formatVersion); 449 } 450 } 451 452 private void testRemoveBigramWords(final int 
formatVersion) { 453 File dictFile = null; 454 try { 455 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 456 } catch (IOException e) { 457 fail("IOException while writing an initial dictionary : " + e); 458 } 459 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 460 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 461 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 462 final int unigramProbability = 100; 463 final int bigramProbability = 150; 464 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 465 addUnigramWord(binaryDictionary, "abb", unigramProbability); 466 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 467 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 468 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 469 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 470 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 471 472 assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); 473 assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc")); 474 assertTrue(isValidBigram(binaryDictionary, "abb", "aaa")); 475 assertTrue(isValidBigram(binaryDictionary, "abb", "bcc")); 476 477 removeBigramEntry(binaryDictionary, "aaa", "abb"); 478 assertFalse(isValidBigram(binaryDictionary, "aaa", "abb")); 479 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 480 assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); 481 482 483 removeBigramEntry(binaryDictionary, "aaa", "bcc"); 484 assertFalse(isValidBigram(binaryDictionary, "aaa", "bcc")); 485 removeBigramEntry(binaryDictionary, "abb", "aaa"); 486 assertFalse(isValidBigram(binaryDictionary, "abb", "aaa")); 487 removeBigramEntry(binaryDictionary, "abb", "bcc"); 488 assertFalse(isValidBigram(binaryDictionary, "abb", "bcc")); 489 490 removeBigramEntry(binaryDictionary, "aaa", "abb"); 491 // Test remove non-existing 
bigram operation. 492 removeBigramEntry(binaryDictionary, "aaa", "abb"); 493 removeBigramEntry(binaryDictionary, "bcc", "aaa"); 494 495 dictFile.delete(); 496 } 497 498 public void testFlushDictionary() { 499 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 500 testFlushDictionary(formatVersion); 501 } 502 } 503 504 private void testFlushDictionary(final int formatVersion) { 505 File dictFile = null; 506 try { 507 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 508 } catch (IOException e) { 509 fail("IOException while writing an initial dictionary : " + e); 510 } 511 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 512 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 513 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 514 515 final int probability = 100; 516 addUnigramWord(binaryDictionary, "aaa", probability); 517 addUnigramWord(binaryDictionary, "abcd", probability); 518 // Close without flushing. 
519 binaryDictionary.close(); 520 521 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 522 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 523 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 524 525 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa")); 526 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd")); 527 528 addUnigramWord(binaryDictionary, "aaa", probability); 529 addUnigramWord(binaryDictionary, "abcd", probability); 530 binaryDictionary.flush(); 531 binaryDictionary.close(); 532 533 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 534 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 535 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 536 537 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 538 assertEquals(probability, binaryDictionary.getFrequency("abcd")); 539 addUnigramWord(binaryDictionary, "bcde", probability); 540 binaryDictionary.flush(); 541 binaryDictionary.close(); 542 543 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 544 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 545 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 546 assertEquals(probability, binaryDictionary.getFrequency("bcde")); 547 binaryDictionary.close(); 548 549 dictFile.delete(); 550 } 551 552 public void testFlushWithGCDictionary() { 553 for (final int formatVersion : DICT_FORMAT_VERSIONS) { 554 testFlushWithGCDictionary(formatVersion); 555 } 556 } 557 558 private void testFlushWithGCDictionary(final int formatVersion) { 559 File dictFile = null; 560 try { 561 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 562 } catch (IOException e) { 563 fail("IOException while writing an initial dictionary : " + e); 564 } 565 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 566 0 /* offset */, 
dictFile.length(), true /* useFullEditDistance */, 567 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 568 569 final int unigramProbability = 100; 570 final int bigramProbability = 150; 571 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 572 addUnigramWord(binaryDictionary, "abb", unigramProbability); 573 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 574 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 575 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 576 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 577 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 578 binaryDictionary.flushWithGC(); 579 binaryDictionary.close(); 580 581 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 582 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 583 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 584 assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa")); 585 assertEquals(unigramProbability, binaryDictionary.getFrequency("abb")); 586 assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc")); 587 if (canCheckBigramProbability(formatVersion)) { 588 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 589 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 590 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 591 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 592 } 593 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 594 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 595 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 596 binaryDictionary.flushWithGC(); 597 binaryDictionary.close(); 598 599 dictFile.delete(); 600 } 601 602 public void testAddBigramWordsAndFlashWithGC() { 603 for (final int formatVersion : 
DICT_FORMAT_VERSIONS) { 604 testAddBigramWordsAndFlashWithGC(formatVersion); 605 } 606 } 607 608 // TODO: Evaluate performance of GC 609 private void testAddBigramWordsAndFlashWithGC(final int formatVersion) { 610 final int wordCount = 100; 611 final int bigramCount = 1000; 612 final int codePointSetSize = 30; 613 final long seed = System.currentTimeMillis(); 614 final Random random = new Random(seed); 615 616 File dictFile = null; 617 try { 618 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 619 } catch (IOException e) { 620 fail("IOException while writing an initial dictionary : " + e); 621 } 622 623 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 624 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 625 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 626 627 final ArrayList<String> words = new ArrayList<String>(); 628 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>(); 629 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 630 final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>(); 631 final HashMap<Pair<String, String>, Integer> bigramProbabilities = 632 new HashMap<Pair<String, String>, Integer>(); 633 634 for (int i = 0; i < wordCount; ++i) { 635 final String word = CodePointUtils.generateWord(random, codePointSet); 636 words.add(word); 637 final int unigramProbability = random.nextInt(0xFF); 638 unigramProbabilities.put(word, unigramProbability); 639 addUnigramWord(binaryDictionary, word, unigramProbability); 640 } 641 642 for (int i = 0; i < bigramCount; i++) { 643 final String word0 = words.get(random.nextInt(wordCount)); 644 final String word1 = words.get(random.nextInt(wordCount)); 645 if (TextUtils.equals(word0, word1)) { 646 continue; 647 } 648 final Pair<String, String> bigram = new Pair<String, String>(word0, word1); 649 bigramWords.add(bigram); 
final int unigramProbability = unigramProbabilities.get(word1);
            final int bigramProbability =
                    unigramProbability + random.nextInt(0xFF - unigramProbability);
            bigramProbabilities.put(bigram, bigramProbability);
            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
        }

        binaryDictionary.flushWithGC();
        binaryDictionary.close();
        binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        for (final Pair<String, String> bigram : bigramWords) {
            final int bigramProbability = bigramProbabilities.get(bigram);
            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
                    isValidBigram(binaryDictionary, bigram.first, bigram.second));
            if (canCheckBigramProbability(formatVersion)) {
                assertEquals(bigramProbability,
                        getBigramProbability(binaryDictionary, bigram.first, bigram.second));
            }
        }

        dictFile.delete();
    }

    /**
     * Runs the random add/remove stress test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testRandomOperationsAndFlashWithGC() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testRandomOperationsAndFlashWithGC(formatVersion);
        }
    }

    /**
     * Repeatedly reopens the dictionary, applies a random mix of unigram additions, bigram
     * additions and bigram removals, verifies the resulting state against an in-memory model,
     * and then flushes with GC before the next iteration.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testRandomOperationsAndFlashWithGC(final int formatVersion) {
        final int flushWithGCIterationCount = 50;
        final int operationCountInEachIteration = 200;
        final int initialUnigramCount = 100;
        final float addUnigramProb = 0.5f;
        final float addBigramProb = 0.8f;
        final float removeBigramProb = 0.2f;
        final int codePointSetSize = 30;

        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }

        BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
        final ArrayList<String> words = new ArrayList<String>();
        final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String, String>>();
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
                new HashMap<Pair<String, String>, Integer>();
        for (int i = 0; i < initialUnigramCount; ++i) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            words.add(word);
            final int unigramProbability = random.nextInt(0xFF);
            unigramProbabilities.put(word, unigramProbability);
            addUnigramWord(binaryDictionary, word, unigramProbability);
        }
        binaryDictionary.flushWithGC();
        binaryDictionary.close();

        for (int gcCount = 0; gcCount < flushWithGCIterationCount; gcCount++) {
            binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                    0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                    Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
            for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) {
                // Add unigram.
                if (random.nextFloat() < addUnigramProb) {
                    final String word = CodePointUtils.generateWord(random, codePointSet);
                    words.add(word);
                    final int unigramProbability = random.nextInt(0xFF);
                    unigramProbabilities.put(word, unigramProbability);
                    addUnigramWord(binaryDictionary, word, unigramProbability);
                }
                // Add bigram.
                if (random.nextFloat() < addBigramProb && words.size() > 2) {
                    final int word0Index = random.nextInt(words.size());
                    // Draw from one fewer slot and shift so the two indices always differ.
                    int word1Index = random.nextInt(words.size() - 1);
                    if (word0Index <= word1Index) {
                        word1Index++;
                    }
                    final String word0 = words.get(word0Index);
                    final String word1 = words.get(word1Index);
                    // The word list may hold the same generated word at two indices.
                    if (TextUtils.equals(word0, word1)) {
                        continue;
                    }
                    final int unigramProbability = unigramProbabilities.get(word1);
                    final int bigramProbability =
                            unigramProbability + random.nextInt(0xFF - unigramProbability);
                    final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
                    bigramWords.add(bigram);
                    bigramProbabilities.put(bigram, bigramProbability);
                    addBigramWords(binaryDictionary, word0, word1, bigramProbability);
                }
                // Remove bigram.
                if (random.nextFloat() < removeBigramProb && !bigramWords.isEmpty()) {
                    final int bigramIndex = random.nextInt(bigramWords.size());
                    final Pair<String, String> bigram = bigramWords.get(bigramIndex);
                    bigramWords.remove(bigramIndex);
                    bigramProbabilities.remove(bigram);
                    removeBigramEntry(binaryDictionary, bigram.first, bigram.second);
                }
            }

            // Test whether all the unigram operations are correctly handled.
            for (int i = 0; i < words.size(); i++) {
                final String word = words.get(i);
                final int unigramProbability = unigramProbabilities.get(word);
                assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
            }
            // Test whether all the bigram operations are correctly handled.
            for (int i = 0; i < bigramWords.size(); i++) {
                final Pair<String, String> bigram = bigramWords.get(i);
                // A pair that was added twice and removed once is still in the list but no
                // longer in the map; it must then be reported as absent by the dictionary.
                final int probability;
                if (bigramProbabilities.containsKey(bigram)) {
                    final int bigramProbability = bigramProbabilities.get(bigram);
                    probability = bigramProbability;
                } else {
                    probability = Dictionary.NOT_A_PROBABILITY;
                }

                if (canCheckBigramProbability(formatVersion)) {
                    assertEquals(probability,
                            getBigramProbability(binaryDictionary, bigram.first, bigram.second));
                }
                assertEquals(probability != Dictionary.NOT_A_PROBABILITY,
                        isValidBigram(binaryDictionary, bigram.first, bigram.second));
            }
            binaryDictionary.flushWithGC();
            binaryDictionary.close();
        }

        dictFile.delete();
    }

    /**
     * Runs the "fill until GC is needed" test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testAddManyUnigramsAndFlushWithGC() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testAddManyUnigramsAndFlushWithGC(formatVersion);
        }
    }

    /**
     * Adds unigrams until the dictionary reports that GC is needed, checks every word added so
     * far, flushes with GC, and repeats the cycle on the reopened dictionary.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testAddManyUnigramsAndFlushWithGC(final int formatVersion) {
        final int flushWithGCIterationCount = 3;
        final int codePointSetSize = 50;

        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        BinaryDictionary binaryDictionary;
        for (int i = 0; i < flushWithGCIterationCount; i++) {
            binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                    0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                    Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
            while (!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                final String word = CodePointUtils.generateWord(random, codePointSet);
                words.add(word);
                final int unigramProbability = random.nextInt(0xFF);
                unigramProbabilities.put(word, unigramProbability);
                addUnigramWord(binaryDictionary, word, unigramProbability);
            }

            for (int j = 0; j < words.size(); j++) {
                final String word = words.get(j);
                final int unigramProbability = unigramProbabilities.get(word);
                assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
            }

            binaryDictionary.flushWithGC();
            binaryDictionary.close();
        }

        dictFile.delete();
    }

    /**
     * Runs the unigram/bigram count test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testUnigramAndBigramCount() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testUnigramAndBigramCount(formatVersion);
        }
    }

    /**
     * Adds batches of unigrams and bigrams and checks that the counts reported through
     * {@code getPropertyForTest} match the number of distinct entries, both before and after
     * flushing with GC.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testUnigramAndBigramCount(final int formatVersion) {
        final int flushWithGCIterationCount = 10;
        final int codePointSetSize = 50;
        final int unigramCountPerIteration = 1000;
        final int bigramCountPerIteration = 2000;
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }

        final ArrayList<String> words = new ArrayList<String>();
        final HashSet<Pair<String, String>> bigrams = new HashSet<Pair<String, String>>();
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        BinaryDictionary binaryDictionary;
        for (int i = 0; i < flushWithGCIterationCount; i++) {
            binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                    0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                    Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
            for (int j = 0; j < unigramCountPerIteration; j++) {
                final String word = CodePointUtils.generateWord(random, codePointSet);
                words.add(word);
                final int unigramProbability = random.nextInt(0xFF);
                addUnigramWord(binaryDictionary, word, unigramProbability);
            }
            for (int j = 0; j < bigramCountPerIteration; j++) {
                final String word0 = words.get(random.nextInt(words.size()));
                final String word1 = words.get(random.nextInt(words.size()));
                if (TextUtils.equals(word0, word1)) {
                    continue;
                }
                bigrams.add(new Pair<String, String>(word0, word1));
                final int bigramProbability = random.nextInt(0xF);
                addBigramWords(binaryDictionary, word0, word1, bigramProbability);
            }

            // The word list may contain duplicates, so count distinct words only. The bigram
            // collection is already a set.
            final int expectedUnigramCount = new HashSet<String>(words).size();
            final int expectedBigramCount = bigrams.size();

            assertEquals(expectedUnigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
            assertEquals(expectedBigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));
            binaryDictionary.flushWithGC();
            // The counts must be unchanged by the flush.
            assertEquals(expectedUnigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
            assertEquals(expectedBigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));
            binaryDictionary.close();
        }

        dictFile.delete();
    }

    /**
     * Runs the bulk-insertion test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testAddMultipleDictionaryEntries() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testAddMultipleDictionaryEntries(formatVersion);
        }
    }

    /**
     * Builds an array of {@link LanguageModelParam}s (a mix of unigram-only and bigram
     * entries), adds them with a single {@code addMultipleDictionaryEntries} call, and checks
     * the stored unigram and bigram probabilities.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testAddMultipleDictionaryEntries(final int formatVersion) {
        final int codePointSetSize = 20;
        final int lmParamCount = 1000;
        final double bigramContinueRate = 0.9;
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }

        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
                new HashMap<Pair<String, String>, Integer>();

        final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount];
        String prevWord = null;
        for (int i = 0; i < languageModelParams.length; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int probability = random.nextInt(0xFF);
            final int bigramProbability = probability + random.nextInt(0xFF - probability);
            unigramProbabilities.put(word, probability);
            if (prevWord == null) {
                languageModelParams[i] = new LanguageModelParam(word, probability,
                        BinaryDictionary.NOT_A_VALID_TIMESTAMP);
            } else {
                languageModelParams[i] = new LanguageModelParam(prevWord, word, probability,
                        bigramProbability, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
                bigramProbabilities.put(new Pair<String, String>(prevWord, word),
                        bigramProbability);
            }
            // Either chain the next entry to this word or start a fresh unigram-only entry.
            prevWord = (random.nextDouble() < bigramContinueRate) ? word : null;
        }

        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
        binaryDictionary.addMultipleDictionaryEntries(languageModelParams);

        for (Map.Entry<String, Integer> entry : unigramProbabilities.entrySet()) {
            assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey()));
        }

        for (Map.Entry<Pair<String, String>, Integer> entry : bigramProbabilities.entrySet()) {
            final String word0 = entry.getKey().first;
            final String word1 = entry.getKey().second;
            final int bigramProbability = entry.getValue();
            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
                    isValidBigram(binaryDictionary, word0, word1));
            if (canCheckBigramProbability(formatVersion)) {
                assertEquals(bigramProbability,
                        getBigramProbability(binaryDictionary, word0, word1));
            }
        }

        // Release the dictionary and remove the temporary file, as the other tests do.
        binaryDictionary.close();
        dictFile.delete();
    }

    /**
     * Runs the word-property test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testGetWordProperties() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testGetWordProperties(formatVersion);
        }
    }

    /**
     * Adds random unigrams and bigrams and checks that {@code getWordProperty} reports the
     * flags, probabilities and bigram targets that were stored.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testGetWordProperties(final int formatVersion) {
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);
        final int UNIGRAM_COUNT = 1000;
        final int BIGRAM_COUNT = 1000;
        final int codePointSetSize = 20;
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
        assertFalse(invalidWordProperty.isValid());

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> wordProbabilities = new HashMap<String, Integer>();
        final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
                new HashMap<Pair<String, String>, Integer>();

        for (int i = 0; i < UNIGRAM_COUNT; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int unigramProbability = random.nextInt(0xFF);
            final boolean isNotAWord = random.nextBoolean();
            final boolean isBlacklisted = random.nextBoolean();
            // TODO: Add tests for historical info.
            binaryDictionary.addUnigramEntry(word, unigramProbability,
                    null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
                    isNotAWord, isBlacklisted, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            words.add(word);
            wordProbabilities.put(word, unigramProbability);
            final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
            assertEquals(word, wordProperty.mWord);
            assertTrue(wordProperty.isValid());
            assertEquals(isNotAWord, wordProperty.mIsNotAWord);
            assertEquals(isBlacklisted, wordProperty.mIsBlacklistEntry);
            assertEquals(false, wordProperty.mHasBigrams);
            assertEquals(false, wordProperty.mHasShortcuts);
            assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
            assertTrue(wordProperty.mShortcutTargets.isEmpty());
        }

        for (int i = 0; i < BIGRAM_COUNT; i++) {
            final String word0 = words.get(random.nextInt(words.size()));
            final String word1 = words.get(random.nextInt(words.size()));
            // Compare the strings, not the indices: the same generated word can occupy more
            // than one slot of the list.
            if (TextUtils.equals(word0, word1)) {
                continue;
            }
            final int unigramProbability = wordProbabilities.get(word1);
            final int bigramProbability =
                    unigramProbability + random.nextInt(0xFF - unigramProbability);
            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            if (!bigrams.containsKey(word0)) {
                final HashSet<String> bigramWord1s = new HashSet<String>();
                bigrams.put(word0, bigramWord1s);
            }
            bigrams.get(word0).add(word1);
            bigramProbabilities.put(new Pair<String, String>(word0, word1), bigramProbability);
        }

        for (int i = 0; i < words.size(); i++) {
            final String word0 = words.get(i);
            if (!bigrams.containsKey(word0)) {
                continue;
            }
            final HashSet<String> bigramWord1s = bigrams.get(word0);
            final WordProperty wordProperty = binaryDictionary.getWordProperty(word0);
            assertEquals(bigramWord1s.size(), wordProperty.mBigrams.size());
            for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
                final String word1 = wordProperty.mBigrams.get(j).mWord;
                assertTrue(bigramWord1s.contains(word1));
                if (canCheckBigramProbability(formatVersion)) {
                    final int bigramProbability = bigramProbabilities.get(
                            new Pair<String, String>(word0, word1));
                    assertEquals(bigramProbability, wordProperty.mBigrams.get(j).getProbability());
                }
            }
        }

        // Release the dictionary and remove the temporary file, as the other tests do.
        binaryDictionary.close();
        dictFile.delete();
    }

    /**
     * Runs the word-iteration test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testIterateAllWords() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testIterateAllWords(formatVersion);
        }
    }

    /**
     * Adds random unigrams and bigrams, then walks the dictionary with
     * {@code getNextWordProperty} and checks that every stored word and bigram is visited with
     * the expected probability.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testIterateAllWords(final int formatVersion) {
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);
        final int UNIGRAM_COUNT = 1000;
        final int BIGRAM_COUNT = 1000;
        final int codePointSetSize = 20;
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
        assertFalse(invalidWordProperty.isValid());

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> wordProbabilitiesToCheckLater =
                new HashMap<String, Integer>();
        final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
                new HashMap<Pair<String, String>, Integer>();

        for (int i = 0; i < UNIGRAM_COUNT; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int unigramProbability = random.nextInt(0xFF);
            addUnigramWord(binaryDictionary, word, unigramProbability);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            words.add(word);
            wordProbabilitiesToCheckLater.put(word, unigramProbability);
        }

        for (int i = 0; i < BIGRAM_COUNT; i++) {
            final String word0 = words.get(random.nextInt(words.size()));
            final String word1 = words.get(random.nextInt(words.size()));
            // Compare the strings, not the indices: the same generated word can occupy more
            // than one slot of the list.
            if (TextUtils.equals(word0, word1)) {
                continue;
            }
            final int unigramProbability = wordProbabilitiesToCheckLater.get(word1);
            final int bigramProbability =
                    unigramProbability + random.nextInt(0xFF - unigramProbability);
            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            if (!bigrams.containsKey(word0)) {
                final HashSet<String> bigramWord1s = new HashSet<String>();
                bigrams.put(word0, bigramWord1s);
            }
            bigrams.get(word0).add(word1);
            bigramProbabilitiesToCheckLater.put(
                    new Pair<String, String>(word0, word1), bigramProbability);
        }

        // Entries are removed from these sets as they are visited; both must end up empty.
        final HashSet<String> wordSet = new HashSet<String>(words);
        final HashSet<Pair<String, String>> bigramSet =
                new HashSet<Pair<String, String>>(bigramProbabilitiesToCheckLater.keySet());
        int token = 0;
        do {
            final BinaryDictionary.GetNextWordPropertyResult result =
                    binaryDictionary.getNextWordProperty(token);
            final WordProperty wordProperty = result.mWordProperty;
            final String word0 = wordProperty.mWord;
            // Fail with an assertion rather than an unboxing NPE on an unexpected word.
            assertTrue(wordProbabilitiesToCheckLater.containsKey(word0));
            assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
                    wordProperty.mProbabilityInfo.mProbability);
            wordSet.remove(word0);
            final HashSet<String> bigramWord1s = bigrams.get(word0);
            for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
                final String word1 = wordProperty.mBigrams.get(j).mWord;
                // bigramWord1s is null when no bigram was recorded for word0.
                assertTrue(bigramWord1s != null && bigramWord1s.contains(word1));
                final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
                if (canCheckBigramProbability(formatVersion)) {
                    final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram);
                    assertEquals(bigramProbability, wordProperty.mBigrams.get(j).getProbability());
                }
                bigramSet.remove(bigram);
            }
            token = result.mNextToken;
        } while (token != 0);
        assertTrue(wordSet.isEmpty());
        assertTrue(bigramSet.isEmpty());

        // Release the dictionary and remove the temporary file, as the other tests do.
        binaryDictionary.close();
        dictFile.delete();
    }

    /**
     * Runs the shortcut test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testAddShortcuts() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testAddShortcuts(formatVersion);
        }
    }

    /**
     * Adds a shortcut to a word, updates its probability, adds a second shortcut, and checks
     * the reported shortcut targets after each step and again after flushing with GC.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testAddShortcuts(final int formatVersion) {
        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        final int unigramProbability = 100;
        final int shortcutProbability = 10;
        binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        WordProperty wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(1, wordProperty.mShortcutTargets.size());
        assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
        assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability());

        // Re-adding the same target must update its probability instead of adding a new one.
        final int updatedShortcutProbability = 2;
        binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
                updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(1, wordProperty.mShortcutTargets.size());
        assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
        assertEquals(updatedShortcutProbability,
                wordProperty.mShortcutTargets.get(0).getProbability());

        binaryDictionary.addUnigramEntry("aaa", unigramProbability, "yyy",
                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        final HashMap<String, Integer> shortcutTargets = new HashMap<String, Integer>();
        shortcutTargets.put("zzz", updatedShortcutProbability);
        shortcutTargets.put("yyy", shortcutProbability);
        wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(2, wordProperty.mShortcutTargets.size());
        for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
            assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
            assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
                    shortcutTarget.getProbability());
            // Remove each matched target so that a duplicate would fail the check above.
            shortcutTargets.remove(shortcutTarget.mWord);
        }

        // Same verification once more after the dictionary has been flushed with GC.
        shortcutTargets.put("zzz", updatedShortcutProbability);
        shortcutTargets.put("yyy", shortcutProbability);
        binaryDictionary.flushWithGC();
        wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(2, wordProperty.mShortcutTargets.size());
        for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
            assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
            assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
                    shortcutTarget.getProbability());
            shortcutTargets.remove(shortcutTarget.mWord);
        }

        // Release the dictionary and remove the temporary file, as the other tests do.
        binaryDictionary.close();
        dictFile.delete();
    }

    /**
     * Runs the many-shortcuts test against every format version listed in
     * {@code DICT_FORMAT_VERSIONS}.
     */
    public void testAddManyShortcuts() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testAddManyShortcuts(formatVersion);
        }
    }

    /**
     * Adds random unigrams, attaches a large number of random shortcut targets to them
     * (flushing with GC whenever the dictionary asks for it), and checks the stored unigram
     * probabilities and shortcut targets.
     *
     * @param formatVersion the dictionary format version to create and exercise.
     */
    private void testAddManyShortcuts(final int formatVersion) {
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);
        final int UNIGRAM_COUNT = 1000;
        final int SHORTCUT_COUNT = 10000;
        final int codePointSetSize = 20;
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final HashMap<String, HashMap<String, Integer>> shortcutTargets =
                new HashMap<String, HashMap<String, Integer>>();

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        for (int i = 0; i < UNIGRAM_COUNT; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int unigramProbability = random.nextInt(0xFF);
            addUnigramWord(binaryDictionary, word, unigramProbability);
            words.add(word);
            unigramProbabilities.put(word, unigramProbability);
            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
        }
        for (int i = 0; i < SHORTCUT_COUNT; i++) {
            final String shortcutTarget = CodePointUtils.generateWord(random, codePointSet);
            final int shortcutProbability = random.nextInt(0xF);
            final String word = words.get(random.nextInt(words.size()));
            final int unigramProbability = unigramProbabilities.get(word);
            binaryDictionary.addUnigramEntry(word, unigramProbability, shortcutTarget,
                    shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                    0 /* timestamp */);
            HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
            if (shortcutTargetsOfWord == null) {
                shortcutTargetsOfWord = new HashMap<String, Integer>();
                shortcutTargets.put(word, shortcutTargetsOfWord);
            }
            shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
        }

        for (final String word : words) {
            final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
            assertEquals((int)unigramProbabilities.get(word),
                    wordProperty.mProbabilityInfo.mProbability);
            final HashMap<String, Integer> expectedTargets = shortcutTargets.get(word);
            if (expectedTargets == null) {
                // The word does not have shortcut targets.
                continue;
            }
            assertEquals(expectedTargets.size(), wordProperty.mShortcutTargets.size());
            for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
                final String targetCodePoints = shortcutTarget.mWord;
                // Fail with an assertion rather than an unboxing NPE on an unknown target.
                assertTrue(expectedTargets.containsKey(targetCodePoints));
                assertEquals((int)expectedTargets.get(targetCodePoints),
                        shortcutTarget.getProbability());
            }
        }

        // Release the dictionary and remove the temporary file, as the other tests do.
        binaryDictionary.close();
        dictFile.delete();
    }

    /**
     * Runs the migration test from {@code FormatSpec.VERSION4_ONLY_FOR_TESTING} to every
     * format version listed in {@code DICT_FORMAT_VERSIONS}.
     */
    public void testDictMigration() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
        }
    }

    /**
     * Populates a small dictionary (unigrams, a bigram, a shortcut and a blacklisted
     * not-a-word entry), migrates it, and checks that the content and the reported format
     * version are as expected afterwards.
     *
     * @param fromFormatVersion the format version the dictionary is created with.
     * @param toFormatVersion the format version the dictionary is migrated to.
     */
    private void testDictMigration(final int fromFormatVersion, final int toFormatVersion) {
        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", fromFormatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
        final int unigramProbability = 100;
        addUnigramWord(binaryDictionary, "aaa", unigramProbability);
        addUnigramWord(binaryDictionary, "bbb", unigramProbability);
        final int bigramProbability = 150;
        addBigramWords(binaryDictionary, "aaa", "bbb", bigramProbability);
        final int shortcutProbability = 10;
        binaryDictionary.addUnigramEntry("ccc", unigramProbability, "xxx", shortcutProbability,
                false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
        binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
                Dictionary.NOT_A_PROBABILITY, true /* isNotAWord */,
                true /* isBlacklisted */, 0 /* timestamp */);

        // State before the migration.
        assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
        assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
        assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
        assertEquals(fromFormatVersion, binaryDictionary.getFormatVersion());

        assertTrue(binaryDictionary.migrateTo(toFormatVersion));

        // State after the migration.
        assertTrue(binaryDictionary.isValidDictionary());
        assertEquals(toFormatVersion, binaryDictionary.getFormatVersion());
        assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
        assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
        if (canCheckBigramProbability(toFormatVersion)) {
            assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bbb"));
        }
        assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
        WordProperty wordProperty = binaryDictionary.getWordProperty("ccc");
        assertEquals(1, wordProperty.mShortcutTargets.size());
        assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
        wordProperty = binaryDictionary.getWordProperty("ddd");
        assertTrue(wordProperty.mIsBlacklistEntry);
        assertTrue(wordProperty.mIsNotAWord);

        // Release the dictionary and remove the temporary file, as the other tests do.
        binaryDictionary.close();
        dictFile.delete();
    }

    /**
     * Runs the large migration test from {@code FormatSpec.VERSION4_ONLY_FOR_TESTING} to every
     * format version listed in {@code DICT_FORMAT_VERSIONS}.
     */
    public void testLargeDictMigration() {
        for (final int formatVersion : DICT_FORMAT_VERSIONS) {
            testLargeDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
        }
    }

    /**
     * Populates a dictionary with thousands of random unigrams and bigrams, migrates it, and
     * checks the probabilities and the unigram/bigram counts afterwards.
     *
     * @param fromFormatVersion the format version the dictionary is created with.
     * @param toFormatVersion the format version the dictionary is migrated to.
     */
    private void testLargeDictMigration(final int fromFormatVersion, final int toFormatVersion) {
        final int UNIGRAM_COUNT = 3000;
        final int BIGRAM_COUNT = 3000;
        final int codePointSetSize = 50;
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", fromFormatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        final ArrayList<String> words = new ArrayList<String>();
        final ArrayList<Pair<String, String>> bigrams = new ArrayList<Pair<String, String>>();
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
                new HashMap<Pair<String, String>, Integer>();

        for (int i = 0; i < UNIGRAM_COUNT; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int unigramProbability = random.nextInt(0xFF);
            addUnigramWord(binaryDictionary, word, unigramProbability);
            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            words.add(word);
            unigramProbabilities.put(word, unigramProbability);
        }

        for (int i = 0; i < BIGRAM_COUNT; i++) {
            final String word0 = words.get(random.nextInt(words.size()));
            final String word1 = words.get(random.nextInt(words.size()));
            // Compare the strings, not the indices: the same generated word can occupy more
            // than one slot of the list.
            if (TextUtils.equals(word0, word1)) {
                continue;
            }
            final int unigramProbability = unigramProbabilities.get(word1);
            final int bigramProbability =
                    random.nextInt(0xFF - unigramProbability) + unigramProbability;
            addBigramWords(binaryDictionary, word0, word1, bigramProbability);
            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
            bigrams.add(bigram);
            bigramProbabilities.put(bigram, bigramProbability);
        }
        assertTrue(binaryDictionary.migrateTo(toFormatVersion));

        for (final String word : words) {
            assertEquals((int)unigramProbabilities.get(word), binaryDictionary.getFrequency(word));
        }
        assertEquals(unigramProbabilities.size(), Integer.parseInt(
                binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));

        for (final Pair<String, String> bigram : bigrams) {
            if (canCheckBigramProbability(toFormatVersion)) {
                assertEquals((int)bigramProbabilities.get(bigram),
                        getBigramProbability(binaryDictionary, bigram.first, bigram.second));
            }
            assertTrue(isValidBigram(binaryDictionary, bigram.first, bigram.second));
        }
        assertEquals(bigramProbabilities.size(), Integer.parseInt(
                binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));

        // Release the dictionary and remove the temporary file, as the other tests do.
        binaryDictionary.close();
        dictFile.delete();
    }
}