BinaryDictionary.java revision 2fa3693c264a4c150ac307d9bb7f6f8f18cc4ffc
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.text.TextUtils; 20import android.util.SparseArray; 21 22import com.android.inputmethod.annotations.UsedForTesting; 23import com.android.inputmethod.keyboard.ProximityInfo; 24import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; 25import com.android.inputmethod.latin.settings.NativeSuggestOptions; 26import com.android.inputmethod.latin.utils.CollectionUtils; 27import com.android.inputmethod.latin.utils.JniUtils; 28import com.android.inputmethod.latin.utils.StringUtils; 29import com.android.inputmethod.latin.utils.UnigramProperty; 30 31import java.io.File; 32import java.util.ArrayList; 33import java.util.Arrays; 34import java.util.Locale; 35import java.util.Map; 36 37/** 38 * Implements a static, compacted, binary dictionary of standard words. 39 */ 40// TODO: All methods which should be locked need to have a suffix "Locked". 41public final class BinaryDictionary extends Dictionary { 42 private static final String TAG = BinaryDictionary.class.getSimpleName(); 43 44 // Must be equal to MAX_WORD_LENGTH in native/jni/src/defines.h 45 private static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH; 46 // Must be equal to MAX_RESULTS in native/jni/src/defines.h 47 private static final int MAX_RESULTS = 18; 48 // The cutoff returned by native for auto-commit confidence. 49 // Must be equal to CONFIDENCE_TO_AUTO_COMMIT in native/jni/src/defines.h 50 private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000; 51 52 @UsedForTesting 53 public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 54 @UsedForTesting 55 public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 56 @UsedForTesting 57 public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 58 @UsedForTesting 59 public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 60 61 public static final int NOT_A_VALID_TIMESTAMP = -1; 62 63 // Format to get unigram flags from native side via getUnigramPropertyNative(). 64 private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT = 4; 65 private static final int FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX = 0; 66 private static final int FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX = 1; 67 private static final int FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX = 2; 68 private static final int FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX = 3; 69 70 // Format to get unigram historical info from native side via getUnigramPropertyNative(). 71 private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT = 3; 72 private static final int FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX = 0; 73 private static final int FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX = 1; 74 private static final int FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX = 2; 75 76 private long mNativeDict; 77 private final Locale mLocale; 78 private final long mDictSize; 79 private final String mDictFilePath; 80 private final int[] mInputCodePoints = new int[MAX_WORD_LENGTH]; 81 private final int[] mOutputCodePoints = new int[MAX_WORD_LENGTH * MAX_RESULTS]; 82 private final int[] mSpaceIndices = new int[MAX_RESULTS]; 83 private final int[] mOutputScores = new int[MAX_RESULTS]; 84 private final int[] mOutputTypes = new int[MAX_RESULTS]; 85 // Only one result is ever used 86 private final int[] mOutputAutoCommitFirstWordConfidence = new int[1]; 87 88 private final NativeSuggestOptions mNativeSuggestOptions = new NativeSuggestOptions(); 89 90 private final SparseArray<DicTraverseSession> mDicTraverseSessions = 91 CollectionUtils.newSparseArray(); 92 93 // TODO: There should be a way to remove used DicTraverseSession objects from 94 // {@code mDicTraverseSessions}. 95 private DicTraverseSession getTraverseSession(final int traverseSessionId) { 96 synchronized(mDicTraverseSessions) { 97 DicTraverseSession traverseSession = mDicTraverseSessions.get(traverseSessionId); 98 if (traverseSession == null) { 99 traverseSession = mDicTraverseSessions.get(traverseSessionId); 100 if (traverseSession == null) { 101 traverseSession = new DicTraverseSession(mLocale, mNativeDict, mDictSize); 102 mDicTraverseSessions.put(traverseSessionId, traverseSession); 103 } 104 } 105 return traverseSession; 106 } 107 } 108 109 /** 110 * Constructor for the binary dictionary. This is supposed to be called from the 111 * dictionary factory. 112 * @param filename the name of the file to read through native code. 113 * @param offset the offset of the dictionary data within the file. 114 * @param length the length of the binary data. 115 * @param useFullEditDistance whether to use the full edit distance in suggestions 116 * @param dictType the dictionary type, as a human-readable string 117 * @param isUpdatable whether to open the dictionary file in writable mode. 118 */ 119 public BinaryDictionary(final String filename, final long offset, final long length, 120 final boolean useFullEditDistance, final Locale locale, final String dictType, 121 final boolean isUpdatable) { 122 super(dictType); 123 mLocale = locale; 124 mDictSize = length; 125 mDictFilePath = filename; 126 mNativeSuggestOptions.setUseFullEditDistance(useFullEditDistance); 127 loadDictionary(filename, offset, length, isUpdatable); 128 } 129 130 static { 131 JniUtils.loadNativeLibrary(); 132 } 133 134 private static native boolean createEmptyDictFileNative(String filePath, long dictVersion, 135 String[] attributeKeyStringArray, String[] attributeValueStringArray); 136 private static native long openNative(String sourceDir, long dictOffset, long dictSize, 137 boolean isUpdatable); 138 private static native void flushNative(long dict, String filePath); 139 private static native boolean needsToRunGCNative(long dict, boolean mindsBlockByGC); 140 private static native void flushWithGCNative(long dict, String filePath); 141 private static native void closeNative(long dict); 142 private static native int getFormatVersionNative(long dict); 143 private static native int getProbabilityNative(long dict, int[] word); 144 private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1); 145 private static native void getUnigramPropertyNative(long dict, int[] word, 146 int[] outCodePoints, boolean[] outFlags, int[] outProbability, 147 int[] outHistoricalInfo, ArrayList<int[]> outShortcutTargets, 148 ArrayList<Integer> outShortcutProbabilities); 149 private static native int getSuggestionsNative(long dict, long proximityInfo, 150 long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, 151 int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint, 152 int[] suggestOptions, int[] prevWordCodePointArray, 153 int[] outputCodePoints, int[] outputScores, int[] outputIndices, int[] outputTypes, 154 int[] outputAutoCommitFirstWordConfidence); 155 private static native float calcNormalizedScoreNative(int[] before, int[] after, int score); 156 private static native int editDistanceNative(int[] before, int[] after); 157 private static native void addUnigramWordNative(long dict, int[] word, int probability, 158 int[] shortcutTarget, int shortcutProbability, boolean isNotAWord, 159 boolean isBlacklisted, int timestamp); 160 private static native void addBigramWordsNative(long dict, int[] word0, int[] word1, 161 int probability, int timestamp); 162 private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1); 163 private static native int addMultipleDictionaryEntriesNative(long dict, 164 LanguageModelParam[] languageModelParams, int startIndex); 165 private static native int calculateProbabilityNative(long dict, int unigramProbability, 166 int bigramProbability); 167 private static native String getPropertyNative(long dict, String query); 168 169 @UsedForTesting 170 public static boolean createEmptyDictFile(final String filePath, final long dictVersion, 171 final Map<String, String> attributeMap) { 172 final String[] keyArray = new String[attributeMap.size()]; 173 final String[] valueArray = new String[attributeMap.size()]; 174 int index = 0; 175 for (final String key : attributeMap.keySet()) { 176 keyArray[index] = key; 177 valueArray[index] = attributeMap.get(key); 178 index++; 179 } 180 return createEmptyDictFileNative(filePath, dictVersion, keyArray, valueArray); 181 } 182 183 // TODO: Move native dict into session 184 private final void loadDictionary(final String path, final long startOffset, 185 final long length, final boolean isUpdatable) { 186 mNativeDict = openNative(path, startOffset, length, isUpdatable); 187 } 188 189 @Override 190 public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, 191 final String prevWord, final ProximityInfo proximityInfo, 192 final boolean blockOffensiveWords, final int[] additionalFeaturesOptions) { 193 return getSuggestionsWithSessionId(composer, prevWord, proximityInfo, blockOffensiveWords, 194 additionalFeaturesOptions, 0 /* sessionId */); 195 } 196 197 @Override 198 public ArrayList<SuggestedWordInfo> getSuggestionsWithSessionId(final WordComposer composer, 199 final String prevWord, final ProximityInfo proximityInfo, 200 final boolean blockOffensiveWords, final int[] additionalFeaturesOptions, 201 final int sessionId) { 202 if (!isValidDictionary()) return null; 203 204 Arrays.fill(mInputCodePoints, Constants.NOT_A_CODE); 205 // TODO: toLowerCase in the native code 206 final int[] prevWordCodePointArray = (null == prevWord) 207 ? null : StringUtils.toCodePointArray(prevWord); 208 final int composerSize = composer.size(); 209 210 final boolean isGesture = composer.isBatchMode(); 211 if (composerSize <= 1 || !isGesture) { 212 if (composerSize > MAX_WORD_LENGTH - 1) return null; 213 for (int i = 0; i < composerSize; i++) { 214 mInputCodePoints[i] = composer.getCodeAt(i); 215 } 216 } 217 218 final InputPointers ips = composer.getInputPointers(); 219 final int inputSize = isGesture ? ips.getPointerSize() : composerSize; 220 mNativeSuggestOptions.setIsGesture(isGesture); 221 mNativeSuggestOptions.setAdditionalFeaturesOptions(additionalFeaturesOptions); 222 // proximityInfo and/or prevWordForBigrams may not be null. 223 final int count = getSuggestionsNative(mNativeDict, proximityInfo.getNativeProximityInfo(), 224 getTraverseSession(sessionId).getSession(), ips.getXCoordinates(), 225 ips.getYCoordinates(), ips.getTimes(), ips.getPointerIds(), mInputCodePoints, 226 inputSize, 0 /* commitPoint */, mNativeSuggestOptions.getOptions(), 227 prevWordCodePointArray, mOutputCodePoints, mOutputScores, mSpaceIndices, 228 mOutputTypes, mOutputAutoCommitFirstWordConfidence); 229 final ArrayList<SuggestedWordInfo> suggestions = CollectionUtils.newArrayList(); 230 for (int j = 0; j < count; ++j) { 231 final int start = j * MAX_WORD_LENGTH; 232 int len = 0; 233 while (len < MAX_WORD_LENGTH && mOutputCodePoints[start + len] != 0) { 234 ++len; 235 } 236 if (len > 0) { 237 final int flags = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_FLAGS; 238 if (blockOffensiveWords 239 && 0 != (flags & SuggestedWordInfo.KIND_FLAG_POSSIBLY_OFFENSIVE) 240 && 0 == (flags & SuggestedWordInfo.KIND_FLAG_EXACT_MATCH)) { 241 // If we block potentially offensive words, and if the word is possibly 242 // offensive, then we don't output it unless it's also an exact match. 243 continue; 244 } 245 final int kind = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_KIND; 246 final int score = SuggestedWordInfo.KIND_WHITELIST == kind 247 ? SuggestedWordInfo.MAX_SCORE : mOutputScores[j]; 248 // TODO: check that all users of the `kind' parameter are ready to accept 249 // flags too and pass mOutputTypes[j] instead of kind 250 suggestions.add(new SuggestedWordInfo(new String(mOutputCodePoints, start, len), 251 score, kind, this /* sourceDict */, 252 mSpaceIndices[j] /* indexOfTouchPointOfSecondWord */, 253 mOutputAutoCommitFirstWordConfidence[0])); 254 } 255 } 256 return suggestions; 257 } 258 259 public boolean isValidDictionary() { 260 return mNativeDict != 0; 261 } 262 263 public int getFormatVersion() { 264 return getFormatVersionNative(mNativeDict); 265 } 266 267 public static float calcNormalizedScore(final String before, final String after, 268 final int score) { 269 return calcNormalizedScoreNative(StringUtils.toCodePointArray(before), 270 StringUtils.toCodePointArray(after), score); 271 } 272 273 public static int editDistance(final String before, final String after) { 274 if (before == null || after == null) { 275 throw new IllegalArgumentException(); 276 } 277 return editDistanceNative(StringUtils.toCodePointArray(before), 278 StringUtils.toCodePointArray(after)); 279 } 280 281 @Override 282 public boolean isValidWord(final String word) { 283 return getFrequency(word) != NOT_A_PROBABILITY; 284 } 285 286 @Override 287 public int getFrequency(final String word) { 288 if (word == null) return NOT_A_PROBABILITY; 289 int[] codePoints = StringUtils.toCodePointArray(word); 290 return getProbabilityNative(mNativeDict, codePoints); 291 } 292 293 // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni 294 // calls when checking for changes in an entire dictionary. 295 public boolean isValidBigram(final String word0, final String word1) { 296 return getBigramProbability(word0, word1) != NOT_A_PROBABILITY; 297 } 298 299 public int getBigramProbability(final String word0, final String word1) { 300 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) return NOT_A_PROBABILITY; 301 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 302 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 303 return getBigramProbabilityNative(mNativeDict, codePoints0, codePoints1); 304 } 305 306 @UsedForTesting 307 public UnigramProperty getUnigramProperty(final String word) { 308 if (TextUtils.isEmpty(word)) { 309 return null; 310 } 311 final int[] codePoints = StringUtils.toCodePointArray(word); 312 final int[] outCodePoints = new int[MAX_WORD_LENGTH]; 313 final boolean[] outFlags = new boolean[FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT]; 314 final int[] outProbability = new int[1]; 315 final int[] outHistoricalInfo = 316 new int[FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT]; 317 final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList(); 318 final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList(); 319 getUnigramPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbability, 320 outHistoricalInfo, outShortcutTargets, outShortcutProbabilities); 321 return new UnigramProperty(codePoints, 322 outFlags[FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX], 323 outFlags[FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX], 324 outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX], 325 outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX], outProbability[0], 326 outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX], 327 outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX], 328 outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX], 329 outShortcutTargets, outShortcutProbabilities); 330 } 331 332 // Add a unigram entry to binary dictionary with unigram attributes in native code. 333 public void addUnigramWord(final String word, final int probability, 334 final String shortcutTarget, final int shortcutProbability, final boolean isNotAWord, 335 final boolean isBlacklisted, final int timestamp) { 336 if (TextUtils.isEmpty(word)) { 337 return; 338 } 339 final int[] codePoints = StringUtils.toCodePointArray(word); 340 final int[] shortcutTargetCodePoints = (shortcutTarget != null) ? 341 StringUtils.toCodePointArray(shortcutTarget) : null; 342 addUnigramWordNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints, 343 shortcutProbability, isNotAWord, isBlacklisted, timestamp); 344 } 345 346 // Add a bigram entry to binary dictionary with timestamp in native code. 347 public void addBigramWords(final String word0, final String word1, final int probability, 348 final int timestamp) { 349 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) { 350 return; 351 } 352 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 353 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 354 addBigramWordsNative(mNativeDict, codePoints0, codePoints1, probability, timestamp); 355 } 356 357 // Remove a bigram entry form binary dictionary in native code. 358 public void removeBigramWords(final String word0, final String word1) { 359 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) { 360 return; 361 } 362 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 363 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 364 removeBigramWordsNative(mNativeDict, codePoints0, codePoints1); 365 } 366 367 public static class LanguageModelParam { 368 public final int[] mWord0; 369 public final int[] mWord1; 370 public final int[] mShortcutTarget; 371 public final int mUnigramProbability; 372 public final int mBigramProbability; 373 public final int mShortcutProbability; 374 public final boolean mIsNotAWord; 375 public final boolean mIsBlacklisted; 376 public final int mTimestamp; 377 378 // Constructor for unigram. 379 public LanguageModelParam(final String word, final int unigramProbability, 380 final int timestamp) { 381 mWord0 = null; 382 mWord1 = StringUtils.toCodePointArray(word); 383 mShortcutTarget = null; 384 mUnigramProbability = unigramProbability; 385 mBigramProbability = NOT_A_PROBABILITY; 386 mShortcutProbability = NOT_A_PROBABILITY; 387 mIsNotAWord = false; 388 mIsBlacklisted = false; 389 mTimestamp = timestamp; 390 } 391 392 // Constructor for unigram and bigram. 393 public LanguageModelParam(final String word0, final String word1, 394 final int unigramProbability, final int bigramProbability, 395 final int timestamp) { 396 mWord0 = StringUtils.toCodePointArray(word0); 397 mWord1 = StringUtils.toCodePointArray(word1); 398 mShortcutTarget = null; 399 mUnigramProbability = unigramProbability; 400 mBigramProbability = bigramProbability; 401 mShortcutProbability = NOT_A_PROBABILITY; 402 mIsNotAWord = false; 403 mIsBlacklisted = false; 404 mTimestamp = timestamp; 405 } 406 } 407 408 public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) { 409 if (!isValidDictionary()) return; 410 int processedParamCount = 0; 411 while (processedParamCount < languageModelParams.length) { 412 if (needsToRunGC(true /* mindsBlockByGC */)) { 413 flushWithGC(); 414 } 415 processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict, 416 languageModelParams, processedParamCount); 417 if (processedParamCount <= 0) { 418 return; 419 } 420 } 421 422 } 423 424 private void reopen() { 425 close(); 426 final File dictFile = new File(mDictFilePath); 427 // WARNING: Because we pass 0 as the offset and file.length() as the length, this can 428 // only be called for actual files. Right now it's only called by the flush() family of 429 // functions, which require an updatable dictionary, so it's okay. But beware. 430 loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */, 431 dictFile.length(), true /* isUpdatable */); 432 } 433 434 public void flush() { 435 if (!isValidDictionary()) return; 436 flushNative(mNativeDict, mDictFilePath); 437 reopen(); 438 } 439 440 public void flushWithGC() { 441 if (!isValidDictionary()) return; 442 flushWithGCNative(mNativeDict, mDictFilePath); 443 reopen(); 444 } 445 446 /** 447 * Checks whether GC is needed to run or not. 448 * @param mindsBlockByGC Whether to mind operations blocked by GC. We don't need to care about 449 * the blocking in some situations such as in idle time or just before closing. 450 * @return whether GC is needed to run or not. 451 */ 452 public boolean needsToRunGC(final boolean mindsBlockByGC) { 453 if (!isValidDictionary()) return false; 454 return needsToRunGCNative(mNativeDict, mindsBlockByGC); 455 } 456 457 @UsedForTesting 458 public int calculateProbability(final int unigramProbability, final int bigramProbability) { 459 if (!isValidDictionary()) return NOT_A_PROBABILITY; 460 return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability); 461 } 462 463 @UsedForTesting 464 public String getPropertyForTests(String query) { 465 if (!isValidDictionary()) return ""; 466 return getPropertyNative(mNativeDict, query); 467 } 468 469 @Override 470 public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { 471 return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT; 472 } 473 474 @Override 475 public void close() { 476 synchronized (mDicTraverseSessions) { 477 final int sessionsSize = mDicTraverseSessions.size(); 478 for (int index = 0; index < sessionsSize; ++index) { 479 final DicTraverseSession traverseSession = mDicTraverseSessions.valueAt(index); 480 if (traverseSession != null) { 481 traverseSession.close(); 482 } 483 } 484 mDicTraverseSessions.clear(); 485 } 486 closeInternalLocked(); 487 } 488 489 private synchronized void closeInternalLocked() { 490 if (mNativeDict != 0) { 491 closeNative(mNativeDict); 492 mNativeDict = 0; 493 } 494 } 495 496 // TODO: Manage BinaryDictionary instances without using WeakReference or something. 497 @Override 498 protected void finalize() throws Throwable { 499 try { 500 closeInternalLocked(); 501 } finally { 502 super.finalize(); 503 } 504 } 505} 506