BinaryDictionary.java revision 38f341a2a53a04ce4195a0cb99fcb6e71203dec0
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.text.TextUtils; 20import android.util.SparseArray; 21 22import com.android.inputmethod.annotations.UsedForTesting; 23import com.android.inputmethod.keyboard.ProximityInfo; 24import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; 25import com.android.inputmethod.latin.makedict.Word; 26import com.android.inputmethod.latin.settings.NativeSuggestOptions; 27import com.android.inputmethod.latin.utils.CollectionUtils; 28import com.android.inputmethod.latin.utils.JniUtils; 29import com.android.inputmethod.latin.utils.LanguageModelParam; 30import com.android.inputmethod.latin.utils.StringUtils; 31import com.android.inputmethod.latin.utils.WordProperty; 32 33import java.io.File; 34import java.util.ArrayList; 35import java.util.Arrays; 36import java.util.Locale; 37import java.util.Map; 38 39/** 40 * Implements a static, compacted, binary dictionary of standard words. 41 */ 42// TODO: All methods which should be locked need to have a suffix "Locked". 43public final class BinaryDictionary extends Dictionary { 44 private static final String TAG = BinaryDictionary.class.getSimpleName(); 45 46 // Must be equal to MAX_WORD_LENGTH in native/jni/src/defines.h 47 private static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH; 48 // Must be equal to MAX_RESULTS in native/jni/src/defines.h 49 private static final int MAX_RESULTS = 18; 50 // The cutoff returned by native for auto-commit confidence. 51 // Must be equal to CONFIDENCE_TO_AUTO_COMMIT in native/jni/src/defines.h 52 private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000; 53 54 @UsedForTesting 55 public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 56 @UsedForTesting 57 public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 58 @UsedForTesting 59 public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 60 @UsedForTesting 61 public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 62 63 public static final int NOT_A_VALID_TIMESTAMP = -1; 64 65 // Format to get unigram flags from native side via getWordPropertyNative(). 66 private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 4; 67 private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0; 68 private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1; 69 private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2; 70 private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3; 71 72 // Format to get probability and historical info from native side via getWordPropertyNative(). 73 public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4; 74 public static final int FORMAT_WORD_PROPERTY_PROBABILITY_INDEX = 0; 75 public static final int FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX = 1; 76 public static final int FORMAT_WORD_PROPERTY_LEVEL_INDEX = 2; 77 public static final int FORMAT_WORD_PROPERTY_COUNT_INDEX = 3; 78 79 private long mNativeDict; 80 private final Locale mLocale; 81 private final long mDictSize; 82 private final String mDictFilePath; 83 private final int[] mInputCodePoints = new int[MAX_WORD_LENGTH]; 84 private final int[] mOutputCodePoints = new int[MAX_WORD_LENGTH * MAX_RESULTS]; 85 private final int[] mSpaceIndices = new int[MAX_RESULTS]; 86 private final int[] mOutputScores = new int[MAX_RESULTS]; 87 private final int[] mOutputTypes = new int[MAX_RESULTS]; 88 // Only one result is ever used 89 private final int[] mOutputAutoCommitFirstWordConfidence = new int[1]; 90 91 private final NativeSuggestOptions mNativeSuggestOptions = new NativeSuggestOptions(); 92 93 private final SparseArray<DicTraverseSession> mDicTraverseSessions = 94 CollectionUtils.newSparseArray(); 95 96 // TODO: There should be a way to remove used DicTraverseSession objects from 97 // {@code mDicTraverseSessions}. 98 private DicTraverseSession getTraverseSession(final int traverseSessionId) { 99 synchronized(mDicTraverseSessions) { 100 DicTraverseSession traverseSession = mDicTraverseSessions.get(traverseSessionId); 101 if (traverseSession == null) { 102 traverseSession = mDicTraverseSessions.get(traverseSessionId); 103 if (traverseSession == null) { 104 traverseSession = new DicTraverseSession(mLocale, mNativeDict, mDictSize); 105 mDicTraverseSessions.put(traverseSessionId, traverseSession); 106 } 107 } 108 return traverseSession; 109 } 110 } 111 112 /** 113 * Constructor for the binary dictionary. This is supposed to be called from the 114 * dictionary factory. 115 * @param filename the name of the file to read through native code. 116 * @param offset the offset of the dictionary data within the file. 117 * @param length the length of the binary data. 118 * @param useFullEditDistance whether to use the full edit distance in suggestions 119 * @param dictType the dictionary type, as a human-readable string 120 * @param isUpdatable whether to open the dictionary file in writable mode. 121 */ 122 public BinaryDictionary(final String filename, final long offset, final long length, 123 final boolean useFullEditDistance, final Locale locale, final String dictType, 124 final boolean isUpdatable) { 125 super(dictType); 126 mLocale = locale; 127 mDictSize = length; 128 mDictFilePath = filename; 129 mNativeSuggestOptions.setUseFullEditDistance(useFullEditDistance); 130 loadDictionary(filename, offset, length, isUpdatable); 131 } 132 133 static { 134 JniUtils.loadNativeLibrary(); 135 } 136 137 private static native boolean createEmptyDictFileNative(String filePath, long dictVersion, 138 String[] attributeKeyStringArray, String[] attributeValueStringArray); 139 private static native long openNative(String sourceDir, long dictOffset, long dictSize, 140 boolean isUpdatable); 141 private static native void flushNative(long dict, String filePath); 142 private static native boolean needsToRunGCNative(long dict, boolean mindsBlockByGC); 143 private static native void flushWithGCNative(long dict, String filePath); 144 private static native void closeNative(long dict); 145 private static native int getFormatVersionNative(long dict); 146 private static native int getProbabilityNative(long dict, int[] word); 147 private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1); 148 private static native void getWordPropertyNative(long dict, int[] word, 149 int[] outCodePoints, boolean[] outFlags, int[] outProbabilityInfo, 150 ArrayList<int[]> outBigramTargets, ArrayList<int[]> outBigramProbabilityInfo, 151 ArrayList<int[]> outShortcutTargets, ArrayList<Integer> outShortcutProbabilities); 152 private static native int getNextWordNative(long dict, int token, int[] outCodePoints); 153 private static native int getSuggestionsNative(long dict, long proximityInfo, 154 long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, 155 int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint, 156 int[] suggestOptions, int[] prevWordCodePointArray, 157 int[] outputCodePoints, int[] outputScores, int[] outputIndices, int[] outputTypes, 158 int[] outputAutoCommitFirstWordConfidence); 159 private static native float calcNormalizedScoreNative(int[] before, int[] after, int score); 160 private static native int editDistanceNative(int[] before, int[] after); 161 private static native void addUnigramWordNative(long dict, int[] word, int probability, 162 int[] shortcutTarget, int shortcutProbability, boolean isNotAWord, 163 boolean isBlacklisted, int timestamp); 164 private static native void addBigramWordsNative(long dict, int[] word0, int[] word1, 165 int probability, int timestamp); 166 private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1); 167 private static native int addMultipleDictionaryEntriesNative(long dict, 168 LanguageModelParam[] languageModelParams, int startIndex); 169 private static native int calculateProbabilityNative(long dict, int unigramProbability, 170 int bigramProbability); 171 private static native int setCurrentTimeForTestNative(int currentTime); 172 private static native String getPropertyNative(long dict, String query); 173 174 @UsedForTesting 175 public static boolean createEmptyDictFile(final String filePath, final long dictVersion, 176 final Map<String, String> attributeMap) { 177 final String[] keyArray = new String[attributeMap.size()]; 178 final String[] valueArray = new String[attributeMap.size()]; 179 int index = 0; 180 for (final String key : attributeMap.keySet()) { 181 keyArray[index] = key; 182 valueArray[index] = attributeMap.get(key); 183 index++; 184 } 185 return createEmptyDictFileNative(filePath, dictVersion, keyArray, valueArray); 186 } 187 188 // TODO: Move native dict into session 189 private final void loadDictionary(final String path, final long startOffset, 190 final long length, final boolean isUpdatable) { 191 mNativeDict = openNative(path, startOffset, length, isUpdatable); 192 } 193 194 @Override 195 public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, 196 final String prevWord, final ProximityInfo proximityInfo, 197 final boolean blockOffensiveWords, final int[] additionalFeaturesOptions) { 198 return getSuggestionsWithSessionId(composer, prevWord, proximityInfo, blockOffensiveWords, 199 additionalFeaturesOptions, 0 /* sessionId */); 200 } 201 202 @Override 203 public ArrayList<SuggestedWordInfo> getSuggestionsWithSessionId(final WordComposer composer, 204 final String prevWord, final ProximityInfo proximityInfo, 205 final boolean blockOffensiveWords, final int[] additionalFeaturesOptions, 206 final int sessionId) { 207 if (!isValidDictionary()) return null; 208 209 Arrays.fill(mInputCodePoints, Constants.NOT_A_CODE); 210 // TODO: toLowerCase in the native code 211 final int[] prevWordCodePointArray = (null == prevWord) 212 ? null : StringUtils.toCodePointArray(prevWord); 213 final int composerSize = composer.size(); 214 215 final boolean isGesture = composer.isBatchMode(); 216 if (composerSize <= 1 || !isGesture) { 217 if (composerSize > MAX_WORD_LENGTH - 1) return null; 218 for (int i = 0; i < composerSize; i++) { 219 mInputCodePoints[i] = composer.getCodeAt(i); 220 } 221 } 222 223 final InputPointers ips = composer.getInputPointers(); 224 final int inputSize = isGesture ? ips.getPointerSize() : composerSize; 225 mNativeSuggestOptions.setIsGesture(isGesture); 226 mNativeSuggestOptions.setAdditionalFeaturesOptions(additionalFeaturesOptions); 227 // proximityInfo and/or prevWordForBigrams may not be null. 228 final int count = getSuggestionsNative(mNativeDict, proximityInfo.getNativeProximityInfo(), 229 getTraverseSession(sessionId).getSession(), ips.getXCoordinates(), 230 ips.getYCoordinates(), ips.getTimes(), ips.getPointerIds(), mInputCodePoints, 231 inputSize, 0 /* commitPoint */, mNativeSuggestOptions.getOptions(), 232 prevWordCodePointArray, mOutputCodePoints, mOutputScores, mSpaceIndices, 233 mOutputTypes, mOutputAutoCommitFirstWordConfidence); 234 final ArrayList<SuggestedWordInfo> suggestions = CollectionUtils.newArrayList(); 235 for (int j = 0; j < count; ++j) { 236 final int start = j * MAX_WORD_LENGTH; 237 int len = 0; 238 while (len < MAX_WORD_LENGTH && mOutputCodePoints[start + len] != 0) { 239 ++len; 240 } 241 if (len > 0) { 242 final int flags = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_FLAGS; 243 if (blockOffensiveWords 244 && 0 != (flags & SuggestedWordInfo.KIND_FLAG_POSSIBLY_OFFENSIVE) 245 && 0 == (flags & SuggestedWordInfo.KIND_FLAG_EXACT_MATCH)) { 246 // If we block potentially offensive words, and if the word is possibly 247 // offensive, then we don't output it unless it's also an exact match. 248 continue; 249 } 250 final int kind = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_KIND; 251 final int score = SuggestedWordInfo.KIND_WHITELIST == kind 252 ? SuggestedWordInfo.MAX_SCORE : mOutputScores[j]; 253 // TODO: check that all users of the `kind' parameter are ready to accept 254 // flags too and pass mOutputTypes[j] instead of kind 255 suggestions.add(new SuggestedWordInfo(new String(mOutputCodePoints, start, len), 256 score, kind, this /* sourceDict */, 257 mSpaceIndices[j] /* indexOfTouchPointOfSecondWord */, 258 mOutputAutoCommitFirstWordConfidence[0])); 259 } 260 } 261 return suggestions; 262 } 263 264 public boolean isValidDictionary() { 265 return mNativeDict != 0; 266 } 267 268 public int getFormatVersion() { 269 return getFormatVersionNative(mNativeDict); 270 } 271 272 public static float calcNormalizedScore(final String before, final String after, 273 final int score) { 274 return calcNormalizedScoreNative(StringUtils.toCodePointArray(before), 275 StringUtils.toCodePointArray(after), score); 276 } 277 278 public static int editDistance(final String before, final String after) { 279 if (before == null || after == null) { 280 throw new IllegalArgumentException(); 281 } 282 return editDistanceNative(StringUtils.toCodePointArray(before), 283 StringUtils.toCodePointArray(after)); 284 } 285 286 @Override 287 public boolean isValidWord(final String word) { 288 return getFrequency(word) != NOT_A_PROBABILITY; 289 } 290 291 @Override 292 public int getFrequency(final String word) { 293 if (word == null) return NOT_A_PROBABILITY; 294 int[] codePoints = StringUtils.toCodePointArray(word); 295 return getProbabilityNative(mNativeDict, codePoints); 296 } 297 298 // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni 299 // calls when checking for changes in an entire dictionary. 300 public boolean isValidBigram(final String word0, final String word1) { 301 return getBigramProbability(word0, word1) != NOT_A_PROBABILITY; 302 } 303 304 public int getBigramProbability(final String word0, final String word1) { 305 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) return NOT_A_PROBABILITY; 306 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 307 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 308 return getBigramProbabilityNative(mNativeDict, codePoints0, codePoints1); 309 } 310 311 @UsedForTesting 312 public WordProperty getWordProperty(final String word) { 313 if (TextUtils.isEmpty(word)) { 314 return null; 315 } 316 final int[] codePoints = StringUtils.toCodePointArray(word); 317 final int[] outCodePoints = new int[MAX_WORD_LENGTH]; 318 final boolean[] outFlags = new boolean[FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT]; 319 final int[] outProbabilityInfo = 320 new int[FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT]; 321 final ArrayList<int[]> outBigramTargets = CollectionUtils.newArrayList(); 322 final ArrayList<int[]> outBigramProbabilityInfo = CollectionUtils.newArrayList(); 323 final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList(); 324 final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList(); 325 getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbabilityInfo, 326 outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, 327 outShortcutProbabilities); 328 return new WordProperty(codePoints, 329 outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX], 330 outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX], 331 outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX], 332 outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbabilityInfo, 333 outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, 334 outShortcutProbabilities); 335 } 336 337 public static class GetNextWordPropertyResult { 338 public WordProperty mWordProperty; 339 public int mNextToken; 340 341 public GetNextWordPropertyResult(final WordProperty wordPreperty, final int nextToken) { 342 mWordProperty = wordPreperty; 343 mNextToken = nextToken; 344 } 345 } 346 347 /** 348 * Method to iterate all words in the dictionary for makedict. 349 * If token is 0, this method newly starts iterating the dictionary. 350 */ 351 @UsedForTesting 352 public GetNextWordPropertyResult getNextWordProperty(final int token) { 353 final int[] codePoints = new int[MAX_WORD_LENGTH]; 354 final int nextToken = getNextWordNative(mNativeDict, token, codePoints); 355 int len = 0; 356 // codePoints is null-terminated if its length is shorter than the array length. 357 while (len < MAX_WORD_LENGTH && codePoints[len] != 0) { 358 ++len; 359 } 360 final String word = new String(mOutputCodePoints, 0, len); 361 return new GetNextWordPropertyResult(getWordProperty(word), nextToken); 362 } 363 364 // Add a unigram entry to binary dictionary with unigram attributes in native code. 365 public void addUnigramWord(final String word, final int probability, 366 final String shortcutTarget, final int shortcutProbability, final boolean isNotAWord, 367 final boolean isBlacklisted, final int timestamp) { 368 if (TextUtils.isEmpty(word)) { 369 return; 370 } 371 final int[] codePoints = StringUtils.toCodePointArray(word); 372 final int[] shortcutTargetCodePoints = (shortcutTarget != null) ? 373 StringUtils.toCodePointArray(shortcutTarget) : null; 374 addUnigramWordNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints, 375 shortcutProbability, isNotAWord, isBlacklisted, timestamp); 376 } 377 378 // Add a bigram entry to binary dictionary with timestamp in native code. 379 public void addBigramWords(final String word0, final String word1, final int probability, 380 final int timestamp) { 381 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) { 382 return; 383 } 384 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 385 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 386 addBigramWordsNative(mNativeDict, codePoints0, codePoints1, probability, timestamp); 387 } 388 389 // Remove a bigram entry form binary dictionary in native code. 390 public void removeBigramWords(final String word0, final String word1) { 391 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) { 392 return; 393 } 394 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 395 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 396 removeBigramWordsNative(mNativeDict, codePoints0, codePoints1); 397 } 398 399 public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) { 400 if (!isValidDictionary()) return; 401 int processedParamCount = 0; 402 while (processedParamCount < languageModelParams.length) { 403 if (needsToRunGC(true /* mindsBlockByGC */)) { 404 flushWithGC(); 405 } 406 processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict, 407 languageModelParams, processedParamCount); 408 if (processedParamCount <= 0) { 409 return; 410 } 411 } 412 } 413 414 private void reopen() { 415 close(); 416 final File dictFile = new File(mDictFilePath); 417 // WARNING: Because we pass 0 as the offset and file.length() as the length, this can 418 // only be called for actual files. Right now it's only called by the flush() family of 419 // functions, which require an updatable dictionary, so it's okay. But beware. 420 loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */, 421 dictFile.length(), true /* isUpdatable */); 422 } 423 424 public void flush() { 425 if (!isValidDictionary()) return; 426 flushNative(mNativeDict, mDictFilePath); 427 reopen(); 428 } 429 430 public void flushWithGC() { 431 if (!isValidDictionary()) return; 432 flushWithGCNative(mNativeDict, mDictFilePath); 433 reopen(); 434 } 435 436 /** 437 * Checks whether GC is needed to run or not. 438 * @param mindsBlockByGC Whether to mind operations blocked by GC. We don't need to care about 439 * the blocking in some situations such as in idle time or just before closing. 440 * @return whether GC is needed to run or not. 441 */ 442 public boolean needsToRunGC(final boolean mindsBlockByGC) { 443 if (!isValidDictionary()) return false; 444 return needsToRunGCNative(mNativeDict, mindsBlockByGC); 445 } 446 447 @UsedForTesting 448 public int calculateProbability(final int unigramProbability, final int bigramProbability) { 449 if (!isValidDictionary()) return NOT_A_PROBABILITY; 450 return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability); 451 } 452 453 /** 454 * Control the current time to be used in the native code. If currentTime >= 0, this method sets 455 * the current time and gets into test mode. 456 * In test mode, set timestamp is used as the current time in the native code. 457 * If currentTime < 0, quit the test mode and returns to using time() to get the current time. 458 * 459 * @param currentTime seconds since the unix epoch 460 * @return current time got in the native code. 461 */ 462 @UsedForTesting 463 public static int setCurrentTimeForTest(final int currentTime) { 464 return setCurrentTimeForTestNative(currentTime); 465 } 466 467 @UsedForTesting 468 public String getPropertyForTest(final String query) { 469 if (!isValidDictionary()) return ""; 470 return getPropertyNative(mNativeDict, query); 471 } 472 473 @Override 474 public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { 475 return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT; 476 } 477 478 @Override 479 public void close() { 480 synchronized (mDicTraverseSessions) { 481 final int sessionsSize = mDicTraverseSessions.size(); 482 for (int index = 0; index < sessionsSize; ++index) { 483 final DicTraverseSession traverseSession = mDicTraverseSessions.valueAt(index); 484 if (traverseSession != null) { 485 traverseSession.close(); 486 } 487 } 488 mDicTraverseSessions.clear(); 489 } 490 closeInternalLocked(); 491 } 492 493 private synchronized void closeInternalLocked() { 494 if (mNativeDict != 0) { 495 closeNative(mNativeDict); 496 mNativeDict = 0; 497 } 498 } 499 500 // TODO: Manage BinaryDictionary instances without using WeakReference or something. 501 @Override 502 protected void finalize() throws Throwable { 503 try { 504 closeInternalLocked(); 505 } finally { 506 super.finalize(); 507 } 508 } 509} 510