BinaryDictionaryTests.java revision 98705b6bf544cff4d781fae8b1ef4e3fabc6b2a3
1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.test.AndroidTestCase; 20import android.test.suitebuilder.annotation.LargeTest; 21import android.text.TextUtils; 22import android.util.Pair; 23 24import com.android.inputmethod.latin.makedict.CodePointUtils; 25import com.android.inputmethod.latin.makedict.FormatSpec; 26import com.android.inputmethod.latin.makedict.WeightedString; 27import com.android.inputmethod.latin.makedict.WordProperty; 28import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; 29import com.android.inputmethod.latin.utils.FileUtils; 30import com.android.inputmethod.latin.utils.LanguageModelParam; 31 32import java.io.File; 33import java.io.IOException; 34import java.util.ArrayList; 35import java.util.HashMap; 36import java.util.HashSet; 37import java.util.Locale; 38import java.util.Map; 39import java.util.Random; 40 41// TODO Use the seed passed as an argument for makedict test. 
42@LargeTest 43public class BinaryDictionaryTests extends AndroidTestCase { 44 private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; 45 private static final String TEST_LOCALE = "test"; 46 47 private File createEmptyDictionaryAndGetFile(final String dictId, 48 final int formatVersion) throws IOException { 49 if (formatVersion == FormatSpec.VERSION4) { 50 return createEmptyVer4DictionaryAndGetFile(dictId); 51 } else { 52 throw new IOException("Dictionary format version " + formatVersion 53 + " is not supported."); 54 } 55 } 56 57 private File createEmptyVer4DictionaryAndGetFile(final String dictId) throws IOException { 58 final File file = File.createTempFile(dictId, TEST_DICT_FILE_EXTENSION, 59 getContext().getCacheDir()); 60 file.delete(); 61 file.mkdir(); 62 Map<String, String> attributeMap = new HashMap<String, String>(); 63 if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), FormatSpec.VERSION4, 64 Locale.ENGLISH, attributeMap)) { 65 return file; 66 } else { 67 throw new IOException("Empty dictionary " + file.getAbsolutePath() 68 + " cannot be created."); 69 } 70 } 71 72 public void testIsValidDictionary() { 73 testIsValidDictionary(FormatSpec.VERSION4); 74 } 75 76 private void testIsValidDictionary(final int formatVersion) { 77 File dictFile = null; 78 try { 79 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 80 } catch (IOException e) { 81 fail("IOException while writing an initial dictionary : " + e); 82 } 83 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 84 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 85 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 86 assertTrue("binaryDictionary must be valid for existing valid dictionary file.", 87 binaryDictionary.isValidDictionary()); 88 binaryDictionary.close(); 89 assertFalse("binaryDictionary must be invalid after closing.", 90 binaryDictionary.isValidDictionary()); 91 
FileUtils.deleteRecursively(dictFile); 92 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, 93 dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), 94 TEST_LOCALE, true /* isUpdatable */); 95 assertFalse("binaryDictionary must be invalid for not existing dictionary file.", 96 binaryDictionary.isValidDictionary()); 97 binaryDictionary.close(); 98 } 99 100 public void testAddTooLongWord() { 101 testAddTooLongWord(FormatSpec.VERSION4); 102 } 103 104 private void testAddTooLongWord(final int formatVersion) { 105 File dictFile = null; 106 try { 107 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 108 } catch (IOException e) { 109 fail("IOException while writing an initial dictionary : " + e); 110 } 111 final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 112 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 113 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 114 115 final StringBuffer stringBuilder = new StringBuffer(); 116 for (int i = 0; i < Constants.DICTIONARY_MAX_WORD_LENGTH; i++) { 117 stringBuilder.append('a'); 118 } 119 final String validLongWord = stringBuilder.toString(); 120 stringBuilder.append('a'); 121 final String invalidLongWord = stringBuilder.toString(); 122 final int probability = 100; 123 addUnigramWord(binaryDictionary, "aaa", probability); 124 addUnigramWord(binaryDictionary, validLongWord, probability); 125 addUnigramWord(binaryDictionary, invalidLongWord, probability); 126 // Too long short cut. 127 binaryDictionary.addUnigramWord("a", probability, invalidLongWord, 128 10 /* shortcutProbability */, false /* isNotAWord */, false /* isBlacklisted */, 129 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 130 addUnigramWord(binaryDictionary, "abc", probability); 131 final int updatedProbability = 200; 132 // Update. 
133 addUnigramWord(binaryDictionary, validLongWord, updatedProbability); 134 addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability); 135 addUnigramWord(binaryDictionary, "abc", updatedProbability); 136 137 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 138 assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord)); 139 assertEquals(BinaryDictionary.NOT_A_PROBABILITY, 140 binaryDictionary.getFrequency(invalidLongWord)); 141 assertEquals(updatedProbability, binaryDictionary.getFrequency("abc")); 142 dictFile.delete(); 143 } 144 145 private void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, 146 final int probability) { 147 binaryDictionary.addUnigramWord(word, probability, "" /* shortcutTarget */, 148 BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */, 149 false /* isNotAWord */, false /* isBlacklisted */, 150 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 151 } 152 153 private void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, 154 final String word1, final int probability) { 155 binaryDictionary.addBigramWords(word0, word1, probability, 156 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 157 } 158 159 public void testAddUnigramWord() { 160 testAddUnigramWord(FormatSpec.VERSION4); 161 } 162 163 private void testAddUnigramWord(final int formatVersion) { 164 File dictFile = null; 165 try { 166 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 167 } catch (IOException e) { 168 fail("IOException while writing an initial dictionary : " + e); 169 } 170 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 171 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 172 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 173 174 final int probability = 100; 175 addUnigramWord(binaryDictionary, "aaa", probability); 176 // Reallocate and create. 
177 addUnigramWord(binaryDictionary, "aab", probability); 178 // Insert into children. 179 addUnigramWord(binaryDictionary, "aac", probability); 180 // Make terminal. 181 addUnigramWord(binaryDictionary, "aa", probability); 182 // Create children. 183 addUnigramWord(binaryDictionary, "aaaa", probability); 184 // Reallocate and make termianl. 185 addUnigramWord(binaryDictionary, "a", probability); 186 187 final int updatedProbability = 200; 188 // Update. 189 addUnigramWord(binaryDictionary, "aaa", updatedProbability); 190 191 assertEquals(probability, binaryDictionary.getFrequency("aab")); 192 assertEquals(probability, binaryDictionary.getFrequency("aac")); 193 assertEquals(probability, binaryDictionary.getFrequency("aa")); 194 assertEquals(probability, binaryDictionary.getFrequency("aaaa")); 195 assertEquals(probability, binaryDictionary.getFrequency("a")); 196 assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); 197 198 dictFile.delete(); 199 } 200 201 public void testRandomlyAddUnigramWord() { 202 testRandomlyAddUnigramWord(FormatSpec.VERSION4); 203 } 204 205 private void testRandomlyAddUnigramWord(final int formatVersion) { 206 final int wordCount = 1000; 207 final int codePointSetSize = 50; 208 final long seed = System.currentTimeMillis(); 209 210 File dictFile = null; 211 try { 212 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 213 } catch (IOException e) { 214 fail("IOException while writing an initial dictionary : " + e); 215 } 216 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 217 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 218 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 219 220 final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>(); 221 // Test a word that isn't contained within the dictionary. 
222 final Random random = new Random(seed); 223 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 224 for (int i = 0; i < wordCount; ++i) { 225 final String word = CodePointUtils.generateWord(random, codePointSet); 226 probabilityMap.put(word, random.nextInt(0xFF)); 227 } 228 for (String word : probabilityMap.keySet()) { 229 addUnigramWord(binaryDictionary, word, probabilityMap.get(word)); 230 } 231 for (String word : probabilityMap.keySet()) { 232 assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); 233 } 234 dictFile.delete(); 235 } 236 237 public void testAddBigramWords() { 238 testAddBigramWords(FormatSpec.VERSION4); 239 } 240 241 private void testAddBigramWords(final int formatVersion) { 242 File dictFile = null; 243 try { 244 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 245 } catch (IOException e) { 246 fail("IOException while writing an initial dictionary : " + e); 247 } 248 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 249 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 250 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 251 252 final int unigramProbability = 100; 253 final int bigramProbability = 10; 254 final int updatedBigramProbability = 15; 255 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 256 addUnigramWord(binaryDictionary, "abb", unigramProbability); 257 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 258 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 259 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 260 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 261 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 262 263 final int probability = binaryDictionary.calculateProbability(unigramProbability, 264 bigramProbability); 265 assertEquals(true, 
binaryDictionary.isValidBigram("aaa", "abb")); 266 assertEquals(true, binaryDictionary.isValidBigram("aaa", "bcc")); 267 assertEquals(true, binaryDictionary.isValidBigram("abb", "aaa")); 268 assertEquals(true, binaryDictionary.isValidBigram("abb", "bcc")); 269 assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "abb")); 270 assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "bcc")); 271 assertEquals(probability, binaryDictionary.getBigramProbability("abb", "aaa")); 272 assertEquals(probability, binaryDictionary.getBigramProbability("abb", "bcc")); 273 274 addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability); 275 final int updatedProbability = binaryDictionary.calculateProbability(unigramProbability, 276 updatedBigramProbability); 277 assertEquals(updatedProbability, binaryDictionary.getBigramProbability("aaa", "abb")); 278 279 assertEquals(false, binaryDictionary.isValidBigram("bcc", "aaa")); 280 assertEquals(false, binaryDictionary.isValidBigram("bcc", "bbc")); 281 assertEquals(false, binaryDictionary.isValidBigram("aaa", "aaa")); 282 assertEquals(Dictionary.NOT_A_PROBABILITY, 283 binaryDictionary.getBigramProbability("bcc", "aaa")); 284 assertEquals(Dictionary.NOT_A_PROBABILITY, 285 binaryDictionary.getBigramProbability("bcc", "bbc")); 286 assertEquals(Dictionary.NOT_A_PROBABILITY, 287 binaryDictionary.getBigramProbability("aaa", "aaa")); 288 289 // Testing bigram link. 
290 addUnigramWord(binaryDictionary, "abcde", unigramProbability); 291 addUnigramWord(binaryDictionary, "fghij", unigramProbability); 292 addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability); 293 addUnigramWord(binaryDictionary, "fgh", unigramProbability); 294 addUnigramWord(binaryDictionary, "abc", unigramProbability); 295 addUnigramWord(binaryDictionary, "f", unigramProbability); 296 assertEquals(probability, binaryDictionary.getBigramProbability("abcde", "fghij")); 297 assertEquals(Dictionary.NOT_A_PROBABILITY, 298 binaryDictionary.getBigramProbability("abcde", "fgh")); 299 addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability); 300 assertEquals(updatedProbability, binaryDictionary.getBigramProbability("abcde", "fghij")); 301 302 dictFile.delete(); 303 } 304 305 public void testRandomlyAddBigramWords() { 306 testRandomlyAddBigramWords(FormatSpec.VERSION4); 307 } 308 309 private void testRandomlyAddBigramWords(final int formatVersion) { 310 final int wordCount = 100; 311 final int bigramCount = 1000; 312 final int codePointSetSize = 50; 313 final long seed = System.currentTimeMillis(); 314 final Random random = new Random(seed); 315 316 File dictFile = null; 317 try { 318 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 319 } catch (IOException e) { 320 fail("IOException while writing an initial dictionary : " + e); 321 } 322 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 323 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 324 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 325 326 final ArrayList<String> words = new ArrayList<String>(); 327 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>(); 328 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 329 final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>(); 330 final 
HashMap<Pair<String, String>, Integer> bigramProbabilities = 331 new HashMap<Pair<String, String>, Integer>(); 332 333 for (int i = 0; i < wordCount; ++i) { 334 final String word = CodePointUtils.generateWord(random, codePointSet); 335 words.add(word); 336 final int unigramProbability = random.nextInt(0xFF); 337 unigramProbabilities.put(word, unigramProbability); 338 addUnigramWord(binaryDictionary, word, unigramProbability); 339 } 340 341 for (int i = 0; i < bigramCount; i++) { 342 final String word0 = words.get(random.nextInt(wordCount)); 343 final String word1 = words.get(random.nextInt(wordCount)); 344 if (TextUtils.equals(word0, word1)) { 345 continue; 346 } 347 final Pair<String, String> bigram = new Pair<String, String>(word0, word1); 348 bigramWords.add(bigram); 349 final int bigramProbability = random.nextInt(0xF); 350 bigramProbabilities.put(bigram, bigramProbability); 351 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 352 } 353 354 for (final Pair<String, String> bigram : bigramWords) { 355 final int unigramProbability = unigramProbabilities.get(bigram.second); 356 final int bigramProbability = bigramProbabilities.get(bigram); 357 final int probability = binaryDictionary.calculateProbability(unigramProbability, 358 bigramProbability); 359 assertEquals(probability, 360 binaryDictionary.getBigramProbability(bigram.first, bigram.second)); 361 } 362 363 dictFile.delete(); 364 } 365 366 public void testRemoveBigramWords() { 367 testRemoveBigramWords(FormatSpec.VERSION4); 368 } 369 370 private void testRemoveBigramWords(final int formatVersion) { 371 File dictFile = null; 372 try { 373 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 374 } catch (IOException e) { 375 fail("IOException while writing an initial dictionary : " + e); 376 } 377 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 378 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 379 
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 380 final int unigramProbability = 100; 381 final int bigramProbability = 10; 382 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 383 addUnigramWord(binaryDictionary, "abb", unigramProbability); 384 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 385 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 386 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 387 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 388 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 389 390 assertEquals(true, binaryDictionary.isValidBigram("aaa", "abb")); 391 assertEquals(true, binaryDictionary.isValidBigram("aaa", "bcc")); 392 assertEquals(true, binaryDictionary.isValidBigram("abb", "aaa")); 393 assertEquals(true, binaryDictionary.isValidBigram("abb", "bcc")); 394 395 binaryDictionary.removeBigramWords("aaa", "abb"); 396 assertEquals(false, binaryDictionary.isValidBigram("aaa", "abb")); 397 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 398 assertEquals(true, binaryDictionary.isValidBigram("aaa", "abb")); 399 400 401 binaryDictionary.removeBigramWords("aaa", "bcc"); 402 assertEquals(false, binaryDictionary.isValidBigram("aaa", "bcc")); 403 binaryDictionary.removeBigramWords("abb", "aaa"); 404 assertEquals(false, binaryDictionary.isValidBigram("abb", "aaa")); 405 binaryDictionary.removeBigramWords("abb", "bcc"); 406 assertEquals(false, binaryDictionary.isValidBigram("abb", "bcc")); 407 408 binaryDictionary.removeBigramWords("aaa", "abb"); 409 // Test remove non-existing bigram operation. 
410 binaryDictionary.removeBigramWords("aaa", "abb"); 411 binaryDictionary.removeBigramWords("bcc", "aaa"); 412 413 dictFile.delete(); 414 } 415 416 public void testFlushDictionary() { 417 testFlushDictionary(FormatSpec.VERSION4); 418 } 419 420 private void testFlushDictionary(final int formatVersion) { 421 File dictFile = null; 422 try { 423 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 424 } catch (IOException e) { 425 fail("IOException while writing an initial dictionary : " + e); 426 } 427 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 428 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 429 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 430 431 final int probability = 100; 432 addUnigramWord(binaryDictionary, "aaa", probability); 433 addUnigramWord(binaryDictionary, "abcd", probability); 434 // Close without flushing. 435 binaryDictionary.close(); 436 437 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 438 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 439 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 440 441 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa")); 442 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd")); 443 444 addUnigramWord(binaryDictionary, "aaa", probability); 445 addUnigramWord(binaryDictionary, "abcd", probability); 446 binaryDictionary.flush(); 447 binaryDictionary.close(); 448 449 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 450 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 451 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 452 453 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 454 assertEquals(probability, binaryDictionary.getFrequency("abcd")); 455 addUnigramWord(binaryDictionary, "bcde", probability); 456 binaryDictionary.flush(); 457 
binaryDictionary.close(); 458 459 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 460 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 461 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 462 assertEquals(probability, binaryDictionary.getFrequency("bcde")); 463 binaryDictionary.close(); 464 465 dictFile.delete(); 466 } 467 468 public void testFlushWithGCDictionary() { 469 testFlushWithGCDictionary(FormatSpec.VERSION4); 470 } 471 472 private void testFlushWithGCDictionary(final int formatVersion) { 473 File dictFile = null; 474 try { 475 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 476 } catch (IOException e) { 477 fail("IOException while writing an initial dictionary : " + e); 478 } 479 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 480 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 481 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 482 483 final int unigramProbability = 100; 484 final int bigramProbability = 10; 485 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 486 addUnigramWord(binaryDictionary, "abb", unigramProbability); 487 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 488 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 489 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 490 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 491 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 492 binaryDictionary.flushWithGC(); 493 binaryDictionary.close(); 494 495 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 496 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 497 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 498 final int probability = binaryDictionary.calculateProbability(unigramProbability, 499 bigramProbability); 500 assertEquals(unigramProbability, 
binaryDictionary.getFrequency("aaa")); 501 assertEquals(unigramProbability, binaryDictionary.getFrequency("abb")); 502 assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc")); 503 assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "abb")); 504 assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "bcc")); 505 assertEquals(probability, binaryDictionary.getBigramProbability("abb", "aaa")); 506 assertEquals(probability, binaryDictionary.getBigramProbability("abb", "bcc")); 507 assertEquals(false, binaryDictionary.isValidBigram("bcc", "aaa")); 508 assertEquals(false, binaryDictionary.isValidBigram("bcc", "bbc")); 509 assertEquals(false, binaryDictionary.isValidBigram("aaa", "aaa")); 510 binaryDictionary.flushWithGC(); 511 binaryDictionary.close(); 512 513 dictFile.delete(); 514 } 515 516 public void testAddBigramWordsAndFlashWithGC() { 517 testAddBigramWordsAndFlashWithGC(FormatSpec.VERSION4); 518 } 519 520 // TODO: Evaluate performance of GC 521 private void testAddBigramWordsAndFlashWithGC(final int formatVersion) { 522 final int wordCount = 100; 523 final int bigramCount = 1000; 524 final int codePointSetSize = 30; 525 final long seed = System.currentTimeMillis(); 526 final Random random = new Random(seed); 527 528 File dictFile = null; 529 try { 530 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 531 } catch (IOException e) { 532 fail("IOException while writing an initial dictionary : " + e); 533 } 534 535 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 536 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 537 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 538 539 final ArrayList<String> words = new ArrayList<String>(); 540 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>(); 541 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 542 final 
HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>(); 543 final HashMap<Pair<String, String>, Integer> bigramProbabilities = 544 new HashMap<Pair<String, String>, Integer>(); 545 546 for (int i = 0; i < wordCount; ++i) { 547 final String word = CodePointUtils.generateWord(random, codePointSet); 548 words.add(word); 549 final int unigramProbability = random.nextInt(0xFF); 550 unigramProbabilities.put(word, unigramProbability); 551 addUnigramWord(binaryDictionary, word, unigramProbability); 552 } 553 554 for (int i = 0; i < bigramCount; i++) { 555 final String word0 = words.get(random.nextInt(wordCount)); 556 final String word1 = words.get(random.nextInt(wordCount)); 557 if (TextUtils.equals(word0, word1)) { 558 continue; 559 } 560 final Pair<String, String> bigram = new Pair<String, String>(word0, word1); 561 bigramWords.add(bigram); 562 final int bigramProbability = random.nextInt(0xF); 563 bigramProbabilities.put(bigram, bigramProbability); 564 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 565 } 566 567 binaryDictionary.flushWithGC(); 568 binaryDictionary.close(); 569 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 570 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 571 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 572 573 for (final Pair<String, String> bigram : bigramWords) { 574 final int unigramProbability = unigramProbabilities.get(bigram.second); 575 final int bigramProbability = bigramProbabilities.get(bigram); 576 final int probability = binaryDictionary.calculateProbability(unigramProbability, 577 bigramProbability); 578 assertEquals(probability, 579 binaryDictionary.getBigramProbability(bigram.first, bigram.second)); 580 } 581 582 dictFile.delete(); 583 } 584 585 public void testRandomOperationsAndFlashWithGC() { 586 testRandomOperationsAndFlashWithGC(FormatSpec.VERSION4); 587 } 588 589 private void testRandomOperationsAndFlashWithGC(final int 
formatVersion) { 590 final int flashWithGCIterationCount = 50; 591 final int operationCountInEachIteration = 200; 592 final int initialUnigramCount = 100; 593 final float addUnigramProb = 0.5f; 594 final float addBigramProb = 0.8f; 595 final float removeBigramProb = 0.2f; 596 final int codePointSetSize = 30; 597 598 final long seed = System.currentTimeMillis(); 599 final Random random = new Random(seed); 600 601 File dictFile = null; 602 try { 603 dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); 604 } catch (IOException e) { 605 fail("IOException while writing an initial dictionary : " + e); 606 } 607 608 BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 609 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 610 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 611 final ArrayList<String> words = new ArrayList<String>(); 612 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>(); 613 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 614 final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>(); 615 final HashMap<Pair<String, String>, Integer> bigramProbabilities = 616 new HashMap<Pair<String, String>, Integer>(); 617 for (int i = 0; i < initialUnigramCount; ++i) { 618 final String word = CodePointUtils.generateWord(random, codePointSet); 619 words.add(word); 620 final int unigramProbability = random.nextInt(0xFF); 621 unigramProbabilities.put(word, unigramProbability); 622 addUnigramWord(binaryDictionary, word, unigramProbability); 623 } 624 binaryDictionary.flushWithGC(); 625 binaryDictionary.close(); 626 627 for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) { 628 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 629 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 630 Locale.getDefault(), TEST_LOCALE, true /* 
isUpdatable */); 631 for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) { 632 // Add unigram. 633 if (random.nextFloat() < addUnigramProb) { 634 final String word = CodePointUtils.generateWord(random, codePointSet); 635 words.add(word); 636 final int unigramProbability = random.nextInt(0xFF); 637 unigramProbabilities.put(word, unigramProbability); 638 addUnigramWord(binaryDictionary, word, unigramProbability); 639 } 640 // Add bigram. 641 if (random.nextFloat() < addBigramProb && words.size() > 2) { 642 final int word0Index = random.nextInt(words.size()); 643 int word1Index = random.nextInt(words.size() - 1); 644 if (word0Index <= word1Index) { 645 word1Index++; 646 } 647 final String word0 = words.get(word0Index); 648 final String word1 = words.get(word1Index); 649 if (TextUtils.equals(word0, word1)) { 650 continue; 651 } 652 final int bigramProbability = random.nextInt(0xF); 653 final Pair<String, String> bigram = new Pair<String, String>(word0, word1); 654 bigramWords.add(bigram); 655 bigramProbabilities.put(bigram, bigramProbability); 656 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 657 } 658 // Remove bigram. 659 if (random.nextFloat() < removeBigramProb && !bigramWords.isEmpty()) { 660 final int bigramIndex = random.nextInt(bigramWords.size()); 661 final Pair<String, String> bigram = bigramWords.get(bigramIndex); 662 bigramWords.remove(bigramIndex); 663 bigramProbabilities.remove(bigram); 664 binaryDictionary.removeBigramWords(bigram.first, bigram.second); 665 } 666 } 667 668 // Test whether the all unigram operations are collectlly handled. 669 for (int i = 0; i < words.size(); i++) { 670 final String word = words.get(i); 671 final int unigramProbability = unigramProbabilities.get(word); 672 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); 673 } 674 // Test whether the all bigram operations are collectlly handled. 
for (int i = 0; i < bigramWords.size(); i++) {
                final Pair<String, String> bigram = bigramWords.get(i);
                final int unigramProbability = unigramProbabilities.get(bigram.second);
                final int probability;
                if (bigramProbabilities.containsKey(bigram)) {
                    final int bigramProbability = bigramProbabilities.get(bigram);
                    probability = binaryDictionary.calculateProbability(unigramProbability,
                            bigramProbability);
                } else {
                    probability = Dictionary.NOT_A_PROBABILITY;
                }
                assertEquals(probability,
                        binaryDictionary.getBigramProbability(bigram.first, bigram.second));
            }
            binaryDictionary.flushWithGC();
            binaryDictionary.close();
        }

        dictFile.delete();
    }

    /** Runs {@link #testAddManyUnigramsAndFlushWithGC(int)} against the version 4 format. */
    public void testAddManyUnigramsAndFlushWithGC() {
        testAddManyUnigramsAndFlushWithGC(FormatSpec.VERSION4);
    }

    /**
     * Repeatedly opens the dictionary, adds random unigrams until
     * {@code needsToRunGC(true)} reports true, checks the frequency of every word added so far
     * (including words from earlier iterations), then flushes with GC and closes.
     *
     * @param formatVersion dictionary format version handed to
     *        {@code createEmptyDictionaryAndGetFile}.
     */
    private void testAddManyUnigramsAndFlushWithGC(final int formatVersion) {
        final int flushWithGCIterationCount = 3;
        final int codePointSetSize = 50;

        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        for (int i = 0; i < flushWithGCIterationCount; i++) {
            final BinaryDictionary binaryDictionary = new BinaryDictionary(
                    dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(),
                    true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
                    true /* isUpdatable */);
            while (!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                final String word = CodePointUtils.generateWord(random, codePointSet);
                words.add(word);
                final int unigramProbability = random.nextInt(0xFF);
                // A regenerated word overwrites its previous probability, matching the update
                // applied to the dictionary.
                unigramProbabilities.put(word, unigramProbability);
                addUnigramWord(binaryDictionary, word, unigramProbability);
            }

            for (final String word : words) {
                final int unigramProbability = unigramProbabilities.get(word);
                assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
            }

            binaryDictionary.flushWithGC();
            binaryDictionary.close();
        }

        dictFile.delete();
    }

    /** Runs {@link #testUnigramAndBigramCount(int)} against the version 4 format. */
    public void testUnigramAndBigramCount() {
        testUnigramAndBigramCount(FormatSpec.VERSION4);
    }

    /**
     * Adds batches of random unigrams and bigrams and checks that the unigram and bigram counts
     * reported through {@code getPropertyForTest} equal the number of distinct entries added,
     * both before and after {@code flushWithGC()}.
     *
     * @param formatVersion dictionary format version handed to
     *        {@code createEmptyDictionaryAndGetFile}.
     */
    private void testUnigramAndBigramCount(final int formatVersion) {
        final int flushWithGCIterationCount = 10;
        final int codePointSetSize = 50;
        final int unigramCountPerIteration = 1000;
        final int bigramCountPerIteration = 2000;
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }

        final ArrayList<String> words = new ArrayList<String>();
        final HashSet<Pair<String, String>> bigrams = new HashSet<Pair<String, String>>();
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        for (int i = 0; i < flushWithGCIterationCount; i++) {
            final BinaryDictionary binaryDictionary = new BinaryDictionary(
                    dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(),
                    true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
                    true /* isUpdatable */);
            for (int j = 0; j < unigramCountPerIteration; j++) {
                final String word = CodePointUtils.generateWord(random, codePointSet);
                words.add(word);
                final int unigramProbability = random.nextInt(0xFF);
                addUnigramWord(binaryDictionary, word, unigramProbability);
            }
            for (int j = 0; j < bigramCountPerIteration; j++) {
                final String word0 = words.get(random.nextInt(words.size()));
                final String word1 = words.get(random.nextInt(words.size()));
                if (TextUtils.equals(word0, word1)) {
                    continue;
                }
                bigrams.add(new Pair<String, String>(word0, word1));
                final int bigramProbability = random.nextInt(0xF);
                addBigramWords(binaryDictionary, word0, word1, bigramProbability);
            }

            // "words" may contain duplicates, so count distinct strings; "bigrams" is already
            // a set. Neither collection changes between the two rounds of checks below.
            final int expectedUnigramCount = new HashSet<String>(words).size();
            final int expectedBigramCount = bigrams.size();
            assertEquals(expectedUnigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
            assertEquals(expectedBigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));
            binaryDictionary.flushWithGC();
            assertEquals(expectedUnigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
            assertEquals(expectedBigramCount, Integer.parseInt(
                    binaryDictionary.getPropertyForTest(BinaryDictionary.BIGRAM_COUNT_QUERY)));
            binaryDictionary.close();
        }

        dictFile.delete();
    }

    /** Runs {@link #testAddMultipleDictionaryEntries(int)} against the version 4 format. */
    public void testAddMultipleDictionaryEntries() {
        testAddMultipleDictionaryEntries(FormatSpec.VERSION4);
    }

    /**
     * Builds an array of random {@link LanguageModelParam}s (unigram-only entries and entries
     * that also carry a bigram from the previous word), adds them in one
     * {@code addMultipleDictionaryEntries} call and verifies the resulting unigram and bigram
     * probabilities.
     *
     * @param formatVersion dictionary format version handed to
     *        {@code createEmptyDictionaryAndGetFile}.
     */
    private void testAddMultipleDictionaryEntries(final int formatVersion) {
        final int codePointSetSize = 20;
        final int lmParamCount = 1000;
        final double bigramContinueRate = 0.9;
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }

        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
                new HashMap<Pair<String, String>, Integer>();

        final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount];
        String prevWord = null;
        for (int i = 0; i < languageModelParams.length; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int probability = random.nextInt(0xFF);
            final int bigramProbability = random.nextInt(0xF);
            unigramProbabilities.put(word, probability);
            if (prevWord == null) {
                languageModelParams[i] = new LanguageModelParam(word, probability,
                        BinaryDictionary.NOT_A_VALID_TIMESTAMP);
            } else {
                languageModelParams[i] = new LanguageModelParam(prevWord, word, probability,
                        bigramProbability, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
                bigramProbabilities.put(new Pair<String, String>(prevWord, word),
                        bigramProbability);
            }
            // With probability bigramContinueRate the next entry chains a bigram from this word.
            prevWord = (random.nextDouble() < bigramContinueRate) ? word : null;
        }

        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
        binaryDictionary.addMultipleDictionaryEntries(languageModelParams);

        for (Map.Entry<String, Integer> entry : unigramProbabilities.entrySet()) {
            assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey()));
        }

        for (Map.Entry<Pair<String, String>, Integer> entry : bigramProbabilities.entrySet()) {
            final String word0 = entry.getKey().first;
            final String word1 = entry.getKey().second;
            final int unigramProbability = unigramProbabilities.get(word1);
            final int bigramProbability = entry.getValue();
            final int probability = binaryDictionary.calculateProbability(
                    unigramProbability, bigramProbability);
            assertEquals(probability, binaryDictionary.getBigramProbability(word0, word1));
        }

        // Same cleanup the other tests in this class perform once they are done.
        binaryDictionary.close();
        dictFile.delete();
    }

    /** Runs {@link #testGetWordProperties(int)} against the version 4 format. */
    public void testGetWordProperties() {
        testGetWordProperties(FormatSpec.VERSION4);
    }

    /**
     * Adds random unigrams and bigrams and verifies the {@link WordProperty} returned by
     * {@code getWordProperty} for each of them: validity, flags, unigram probability and the
     * list of bigrams with their probabilities.
     *
     * @param formatVersion dictionary format version handed to
     *        {@code createEmptyDictionaryAndGetFile}.
     */
    private void testGetWordProperties(final int formatVersion) {
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);
        final int UNIGRAM_COUNT = 1000;
        final int BIGRAM_COUNT = 1000;
        final int codePointSetSize = 20;
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        // A word that was never added must yield an invalid property.
        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
        assertFalse(invalidWordProperty.isValid());

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> wordProbabilities = new HashMap<String, Integer>();
        final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
                new HashMap<Pair<String, String>, Integer>();

        for (int i = 0; i < UNIGRAM_COUNT; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int unigramProbability = random.nextInt(0xFF);
            final boolean isNotAWord = random.nextBoolean();
            final boolean isBlacklisted = random.nextBoolean();
            // TODO: Add tests for historical info.
            binaryDictionary.addUnigramWord(word, unigramProbability,
                    null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
                    isNotAWord, isBlacklisted, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            words.add(word);
            wordProbabilities.put(word, unigramProbability);
            final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
            assertEquals(word, wordProperty.mWord);
            assertTrue(wordProperty.isValid());
            assertEquals(isNotAWord, wordProperty.mIsNotAWord);
            assertEquals(isBlacklisted, wordProperty.mIsBlacklistEntry);
            assertEquals(false, wordProperty.mHasBigrams);
            assertEquals(false, wordProperty.mHasShortcuts);
            assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
            assertTrue(wordProperty.mShortcutTargets.isEmpty());
        }

        for (int i = 0; i < BIGRAM_COUNT; i++) {
            // The indices address the "words" list, so the bound comes from that list.
            final int word0Index = random.nextInt(words.size());
            final int word1Index = random.nextInt(words.size());
            if (word0Index == word1Index) {
                continue;
            }
            final String word0 = words.get(word0Index);
            final String word1 = words.get(word1Index);
            final int bigramProbability = random.nextInt(0xF);
            binaryDictionary.addBigramWords(word0, word1, bigramProbability,
                    BinaryDictionary.NOT_A_VALID_TIMESTAMP);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            if (!bigrams.containsKey(word0)) {
                bigrams.put(word0, new HashSet<String>());
            }
            bigrams.get(word0).add(word1);
            bigramProbabilities.put(new Pair<String, String>(word0, word1), bigramProbability);
        }

        for (final String word0 : words) {
            if (!bigrams.containsKey(word0)) {
                continue;
            }
            final HashSet<String> bigramWord1s = bigrams.get(word0);
            final WordProperty wordProperty = binaryDictionary.getWordProperty(word0);
            assertEquals(bigramWord1s.size(), wordProperty.mBigrams.size());
            for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
                final String word1 = wordProperty.mBigrams.get(j).mWord;
                assertTrue(bigramWord1s.contains(word1));
                final int bigramProbabilityDelta = bigramProbabilities.get(
                        new Pair<String, String>(word0, word1));
                final int unigramProbability = wordProbabilities.get(word1);
                final int bigramProbability = binaryDictionary.calculateProbability(
                        unigramProbability, bigramProbabilityDelta);
                // Expected value first so that failure messages read correctly.
                assertEquals(bigramProbability, wordProperty.mBigrams.get(j).getProbability());
            }
        }

        // Same cleanup the other tests in this class perform once they are done.
        binaryDictionary.close();
        dictFile.delete();
    }

    /** Runs {@link #testIterateAllWords(int)} against the version 4 format. */
    public void testIterateAllWords() {
        testIterateAllWords(FormatSpec.VERSION4);
    }

    /**
     * Adds random unigrams and bigrams, then walks the dictionary with
     * {@code getNextWordProperty} and verifies that every added word and bigram is visited with
     * the expected probability.
     *
     * @param formatVersion dictionary format version handed to
     *        {@code createEmptyDictionaryAndGetFile}.
     */
    private void testIterateAllWords(final int formatVersion) {
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);
        final int UNIGRAM_COUNT = 1000;
        final int BIGRAM_COUNT = 1000;
        final int codePointSetSize = 20;
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        // A word that was never added must yield an invalid property.
        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
        assertFalse(invalidWordProperty.isValid());

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> wordProbabilitiesToCheckLater =
                new HashMap<String, Integer>();
        final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
        final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
                new HashMap<Pair<String, String>, Integer>();

        for (int i = 0; i < UNIGRAM_COUNT; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int unigramProbability = random.nextInt(0xFF);
            addUnigramWord(binaryDictionary, word, unigramProbability);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            words.add(word);
            wordProbabilitiesToCheckLater.put(word, unigramProbability);
        }

        for (int i = 0; i < BIGRAM_COUNT; i++) {
            // The indices address the "words" list, so the bound comes from that list.
            final int word0Index = random.nextInt(words.size());
            final int word1Index = random.nextInt(words.size());
            if (word0Index == word1Index) {
                continue;
            }
            final String word0 = words.get(word0Index);
            final String word1 = words.get(word1Index);
            final int bigramProbability = random.nextInt(0xF);
            binaryDictionary.addBigramWords(word0, word1, bigramProbability,
                    BinaryDictionary.NOT_A_VALID_TIMESTAMP);
            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
            if (!bigrams.containsKey(word0)) {
                bigrams.put(word0, new HashSet<String>());
            }
            bigrams.get(word0).add(word1);
            bigramProbabilitiesToCheckLater.put(
                    new Pair<String, String>(word0, word1), bigramProbability);
        }

        // Entries are removed from these sets as they are visited; both must end up empty.
        final HashSet<String> wordSet = new HashSet<String>(words);
        final HashSet<Pair<String, String>> bigramSet =
                new HashSet<Pair<String, String>>(bigramProbabilitiesToCheckLater.keySet());
        int token = 0;
        do {
            final BinaryDictionary.GetNextWordPropertyResult result =
                    binaryDictionary.getNextWordProperty(token);
            final WordProperty wordProperty = result.mWordProperty;
            final String word0 = wordProperty.mWord;
            assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
                    wordProperty.mProbabilityInfo.mProbability);
            wordSet.remove(word0);
            final HashSet<String> bigramWord1s = bigrams.get(word0);
            for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
                final String word1 = wordProperty.mBigrams.get(j).mWord;
                assertTrue(bigramWord1s.contains(word1));
                final int unigramProbability = wordProbabilitiesToCheckLater.get(word1);
                final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
                final int bigramProbabilityDelta = bigramProbabilitiesToCheckLater.get(bigram);
                final int bigramProbability = binaryDictionary.calculateProbability(
                        unigramProbability, bigramProbabilityDelta);
                // Expected value first so that failure messages read correctly.
                assertEquals(bigramProbability, wordProperty.mBigrams.get(j).getProbability());
                bigramSet.remove(bigram);
            }
            token = result.mNextToken;
        } while (token != 0);
        assertTrue(wordSet.isEmpty());
        assertTrue(bigramSet.isEmpty());

        // Same cleanup the other tests in this class perform once they are done.
        binaryDictionary.close();
        dictFile.delete();
    }

    /** Runs {@link #testAddShortcuts(int)} against the version 4 format. */
    public void testAddShortcuts() {
        testAddShortcuts(FormatSpec.VERSION4);
    }

    /**
     * Adds shortcut targets to the word "aaa" and verifies, through {@code getWordProperty},
     * that adding a new target appends it, re-adding an existing target updates its
     * probability, and both targets are still present after {@code flushWithGC()}.
     *
     * @param formatVersion dictionary format version handed to
     *        {@code createEmptyDictionaryAndGetFile}.
     */
    private void testAddShortcuts(final int formatVersion) {
        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        final int unigramProbability = 100;
        final int shortcutProbability = 10;
        binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz",
                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        WordProperty wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(1, wordProperty.mShortcutTargets.size());
        assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
        assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability());

        // Re-adding the same target must update its probability instead of duplicating it.
        final int updatedShortcutProbability = 2;
        binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz",
                updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(1, wordProperty.mShortcutTargets.size());
        assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
        assertEquals(updatedShortcutProbability,
                wordProperty.mShortcutTargets.get(0).getProbability());

        binaryDictionary.addUnigramWord("aaa", unigramProbability, "yyy",
                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        final HashMap<String, Integer> shortcutTargets = new HashMap<String, Integer>();
        shortcutTargets.put("zzz", updatedShortcutProbability);
        shortcutTargets.put("yyy", shortcutProbability);
        wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(2, wordProperty.mShortcutTargets.size());
        for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
            assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
            assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
                    shortcutTarget.getProbability());
            // Removing each matched target makes a repeated target fail the containsKey check.
            shortcutTargets.remove(shortcutTarget.mWord);
        }

        // Refill the expectations and repeat the check after a flush with GC.
        shortcutTargets.put("zzz", updatedShortcutProbability);
        shortcutTargets.put("yyy", shortcutProbability);
        binaryDictionary.flushWithGC();
        wordProperty = binaryDictionary.getWordProperty("aaa");
        assertEquals(2, wordProperty.mShortcutTargets.size());
        for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
            assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
            assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
                    shortcutTarget.getProbability());
            shortcutTargets.remove(shortcutTarget.mWord);
        }

        // Same cleanup the other tests in this class perform once they are done.
        binaryDictionary.close();
        dictFile.delete();
    }

    /** Runs {@link #testAddManyShortcuts(int)} against the version 4 format. */
    public void testAddManyShortcuts() {
        testAddManyShortcuts(FormatSpec.VERSION4);
    }

    /**
     * Adds random unigrams, attaches many random shortcut targets to randomly chosen words
     * (flushing with GC whenever {@code needsToRunGC(true)} reports true) and verifies the
     * unigram probability and the shortcut targets of every word.
     *
     * @param formatVersion dictionary format version handed to
     *        {@code createEmptyDictionaryAndGetFile}.
     */
    private void testAddManyShortcuts(final int formatVersion) {
        final long seed = System.currentTimeMillis();
        final Random random = new Random(seed);
        final int UNIGRAM_COUNT = 1000;
        final int SHORTCUT_COUNT = 10000;
        final int codePointSetSize = 20;
        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

        final ArrayList<String> words = new ArrayList<String>();
        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
        final HashMap<String, HashMap<String, Integer>> shortcutTargets =
                new HashMap<String, HashMap<String, Integer>>();

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        }
        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

        for (int i = 0; i < UNIGRAM_COUNT; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            final int unigramProbability = random.nextInt(0xFF);
            addUnigramWord(binaryDictionary, word, unigramProbability);
            words.add(word);
            unigramProbabilities.put(word, unigramProbability);
            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
        }
        for (int i = 0; i < SHORTCUT_COUNT; i++) {
            final String shortcutTarget = CodePointUtils.generateWord(random, codePointSet);
            final int shortcutProbability = random.nextInt(0xF);
            final String word = words.get(random.nextInt(words.size()));
            final int unigramProbability = unigramProbabilities.get(word);
            binaryDictionary.addUnigramWord(word, unigramProbability, shortcutTarget,
                    shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                    0 /* timestamp */);
            HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
            if (shortcutTargetsOfWord == null) {
                shortcutTargetsOfWord = new HashMap<String, Integer>();
                shortcutTargets.put(word, shortcutTargetsOfWord);
            }
            // A repeated target overwrites the expected probability recorded for it.
            shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                binaryDictionary.flushWithGC();
            }
        }

        for (final String word : words) {
            final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
            assertEquals((int)unigramProbabilities.get(word),
                    wordProperty.mProbabilityInfo.mProbability);
            if (!shortcutTargets.containsKey(word)) {
                // The word does not have shortcut targets.
                continue;
            }
            assertEquals(shortcutTargets.get(word).size(), wordProperty.mShortcutTargets.size());
            for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
                final String targetCodePoints = shortcutTarget.mWord;
                assertEquals((int)shortcutTargets.get(word).get(targetCodePoints),
                        shortcutTarget.getProbability());
            }
        }

        // Same cleanup the other tests in this class perform once they are done.
        binaryDictionary.close();
        dictFile.delete();
    }
}