BinaryDictionary.java revision a245d15da5d295af21ead9a01583c64796a31ad7
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.text.TextUtils; 20import android.util.SparseArray; 21 22import com.android.inputmethod.annotations.UsedForTesting; 23import com.android.inputmethod.keyboard.ProximityInfo; 24import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; 25import com.android.inputmethod.latin.settings.NativeSuggestOptions; 26import com.android.inputmethod.latin.utils.CollectionUtils; 27import com.android.inputmethod.latin.utils.JniUtils; 28import com.android.inputmethod.latin.utils.StringUtils; 29import com.android.inputmethod.latin.utils.UnigramProperty; 30 31import java.io.File; 32import java.util.ArrayList; 33import java.util.Arrays; 34import java.util.Locale; 35import java.util.Map; 36 37/** 38 * Implements a static, compacted, binary dictionary of standard words. 39 */ 40// TODO: All methods which should be locked need to have a suffix "Locked". 41public final class BinaryDictionary extends Dictionary { 42 private static final String TAG = BinaryDictionary.class.getSimpleName(); 43 44 // Must be equal to MAX_WORD_LENGTH in native/jni/src/defines.h 45 private static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH; 46 // Must be equal to MAX_RESULTS in native/jni/src/defines.h 47 private static final int MAX_RESULTS = 18; 48 // The cutoff returned by native for auto-commit confidence. 49 // Must be equal to CONFIDENCE_TO_AUTO_COMMIT in native/jni/src/defines.h 50 private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000; 51 52 @UsedForTesting 53 public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 54 @UsedForTesting 55 public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 56 @UsedForTesting 57 public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 58 @UsedForTesting 59 public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 60 61 public static final int NOT_A_VALID_TIMESTAMP = -1; 62 63 // Format to get unigram flags from native side via getUnigramPropertyNative(). 64 private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT = 4; 65 private static final int FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX = 0; 66 private static final int FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX = 1; 67 private static final int FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX = 2; 68 private static final int FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX = 3; 69 70 // Format to get unigram historical info from native side via getUnigramPropertyNative(). 71 private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT = 3; 72 private static final int FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX = 0; 73 private static final int FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX = 1; 74 private static final int FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX = 2; 75 76 private long mNativeDict; 77 private final Locale mLocale; 78 private final long mDictSize; 79 private final String mDictFilePath; 80 private final int[] mInputCodePoints = new int[MAX_WORD_LENGTH]; 81 private final int[] mOutputCodePoints = new int[MAX_WORD_LENGTH * MAX_RESULTS]; 82 private final int[] mSpaceIndices = new int[MAX_RESULTS]; 83 private final int[] mOutputScores = new int[MAX_RESULTS]; 84 private final int[] mOutputTypes = new int[MAX_RESULTS]; 85 // Only one result is ever used 86 private final int[] mOutputAutoCommitFirstWordConfidence = new int[1]; 87 88 private final NativeSuggestOptions mNativeSuggestOptions = new NativeSuggestOptions(); 89 90 private final SparseArray<DicTraverseSession> mDicTraverseSessions = 91 CollectionUtils.newSparseArray(); 92 93 // TODO: There should be a way to remove used DicTraverseSession objects from 94 // {@code mDicTraverseSessions}. 95 private DicTraverseSession getTraverseSession(final int traverseSessionId) { 96 synchronized(mDicTraverseSessions) { 97 DicTraverseSession traverseSession = mDicTraverseSessions.get(traverseSessionId); 98 if (traverseSession == null) { 99 traverseSession = mDicTraverseSessions.get(traverseSessionId); 100 if (traverseSession == null) { 101 traverseSession = new DicTraverseSession(mLocale, mNativeDict, mDictSize); 102 mDicTraverseSessions.put(traverseSessionId, traverseSession); 103 } 104 } 105 return traverseSession; 106 } 107 } 108 109 /** 110 * Constructor for the binary dictionary. This is supposed to be called from the 111 * dictionary factory. 112 * @param filename the name of the file to read through native code. 113 * @param offset the offset of the dictionary data within the file. 114 * @param length the length of the binary data. 115 * @param useFullEditDistance whether to use the full edit distance in suggestions 116 * @param dictType the dictionary type, as a human-readable string 117 * @param isUpdatable whether to open the dictionary file in writable mode. 118 */ 119 public BinaryDictionary(final String filename, final long offset, final long length, 120 final boolean useFullEditDistance, final Locale locale, final String dictType, 121 final boolean isUpdatable) { 122 super(dictType); 123 mLocale = locale; 124 mDictSize = length; 125 mDictFilePath = filename; 126 mNativeSuggestOptions.setUseFullEditDistance(useFullEditDistance); 127 loadDictionary(filename, offset, length, isUpdatable); 128 } 129 130 static { 131 JniUtils.loadNativeLibrary(); 132 } 133 134 private static native boolean createEmptyDictFileNative(String filePath, long dictVersion, 135 String[] attributeKeyStringArray, String[] attributeValueStringArray); 136 private static native long openNative(String sourceDir, long dictOffset, long dictSize, 137 boolean isUpdatable); 138 private static native void flushNative(long dict, String filePath); 139 private static native boolean needsToRunGCNative(long dict, boolean mindsBlockByGC); 140 private static native void flushWithGCNative(long dict, String filePath); 141 private static native void closeNative(long dict); 142 private static native int getFormatVersionNative(long dict); 143 private static native int getProbabilityNative(long dict, int[] word); 144 private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1); 145 private static native void getUnigramPropertyNative(long dict, int[] word, 146 int[] outCodePoints, boolean[] outFlags, int[] outProbability, 147 int[] outHistoricalInfo, ArrayList<int[]> outShortcutTargets, 148 ArrayList<Integer> outShortcutProbabilities); 149 private static native int getSuggestionsNative(long dict, long proximityInfo, 150 long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, 151 int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint, 152 int[] suggestOptions, int[] prevWordCodePointArray, 153 int[] outputCodePoints, int[] outputScores, int[] outputIndices, int[] outputTypes, 154 int[] outputAutoCommitFirstWordConfidence); 155 private static native float calcNormalizedScoreNative(int[] before, int[] after, int score); 156 private static native int editDistanceNative(int[] before, int[] after); 157 private static native void addUnigramWordNative(long dict, int[] word, int probability, 158 int[] shortcutTarget, int shortcutProbability, boolean isNotAWord, 159 boolean isBlacklisted, int timestamp); 160 private static native void addBigramWordsNative(long dict, int[] word0, int[] word1, 161 int probability, int timestamp); 162 private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1); 163 private static native int addMultipleDictionaryEntriesNative(long dict, 164 LanguageModelParam[] languageModelParams, int startIndex); 165 private static native int calculateProbabilityNative(long dict, int unigramProbability, 166 int bigramProbability); 167 private static native String getPropertyNative(long dict, String query); 168 169 @UsedForTesting 170 public static boolean createEmptyDictFile(final String filePath, final long dictVersion, 171 final Map<String, String> attributeMap) { 172 final String[] keyArray = new String[attributeMap.size()]; 173 final String[] valueArray = new String[attributeMap.size()]; 174 int index = 0; 175 for (final String key : attributeMap.keySet()) { 176 keyArray[index] = key; 177 valueArray[index] = attributeMap.get(key); 178 index++; 179 } 180 return createEmptyDictFileNative(filePath, dictVersion, keyArray, valueArray); 181 } 182 183 // TODO: Move native dict into session 184 private final void loadDictionary(final String path, final long startOffset, 185 final long length, final boolean isUpdatable) { 186 mNativeDict = openNative(path, startOffset, length, isUpdatable); 187 } 188 189 @Override 190 public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, 191 final String prevWord, final ProximityInfo proximityInfo, 192 final boolean blockOffensiveWords, final int[] additionalFeaturesOptions) { 193 return getSuggestionsWithSessionId(composer, prevWord, proximityInfo, blockOffensiveWords, 194 additionalFeaturesOptions, 0 /* sessionId */); 195 } 196 197 @Override 198 public ArrayList<SuggestedWordInfo> getSuggestionsWithSessionId(final WordComposer composer, 199 final String prevWord, final ProximityInfo proximityInfo, 200 final boolean blockOffensiveWords, final int[] additionalFeaturesOptions, 201 final int sessionId) { 202 if (!isValidDictionary()) return null; 203 204 Arrays.fill(mInputCodePoints, Constants.NOT_A_CODE); 205 // TODO: toLowerCase in the native code 206 final int[] prevWordCodePointArray = (null == prevWord) 207 ? null : StringUtils.toCodePointArray(prevWord); 208 final int composerSize = composer.size(); 209 210 final boolean isGesture = composer.isBatchMode(); 211 if (composerSize <= 1 || !isGesture) { 212 if (composerSize > MAX_WORD_LENGTH - 1) return null; 213 for (int i = 0; i < composerSize; i++) { 214 mInputCodePoints[i] = composer.getCodeAt(i); 215 } 216 } 217 218 final InputPointers ips = composer.getInputPointers(); 219 final int inputSize = isGesture ? ips.getPointerSize() : composerSize; 220 mNativeSuggestOptions.setIsGesture(isGesture); 221 mNativeSuggestOptions.setAdditionalFeaturesOptions(additionalFeaturesOptions); 222 // proximityInfo and/or prevWordForBigrams may not be null. 223 final int count = getSuggestionsNative(mNativeDict, proximityInfo.getNativeProximityInfo(), 224 getTraverseSession(sessionId).getSession(), ips.getXCoordinates(), 225 ips.getYCoordinates(), ips.getTimes(), ips.getPointerIds(), mInputCodePoints, 226 inputSize, 0 /* commitPoint */, mNativeSuggestOptions.getOptions(), 227 prevWordCodePointArray, mOutputCodePoints, mOutputScores, mSpaceIndices, 228 mOutputTypes, mOutputAutoCommitFirstWordConfidence); 229 final ArrayList<SuggestedWordInfo> suggestions = CollectionUtils.newArrayList(); 230 for (int j = 0; j < count; ++j) { 231 final int start = j * MAX_WORD_LENGTH; 232 int len = 0; 233 while (len < MAX_WORD_LENGTH && mOutputCodePoints[start + len] != 0) { 234 ++len; 235 } 236 if (len > 0) { 237 final int flags = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_FLAGS; 238 if (blockOffensiveWords 239 && 0 != (flags & SuggestedWordInfo.KIND_FLAG_POSSIBLY_OFFENSIVE) 240 && 0 == (flags & SuggestedWordInfo.KIND_FLAG_EXACT_MATCH)) { 241 // If we block potentially offensive words, and if the word is possibly 242 // offensive, then we don't output it unless it's also an exact match. 243 continue; 244 } 245 final int kind = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_KIND; 246 final int score = SuggestedWordInfo.KIND_WHITELIST == kind 247 ? SuggestedWordInfo.MAX_SCORE : mOutputScores[j]; 248 // TODO: check that all users of the `kind' parameter are ready to accept 249 // flags too and pass mOutputTypes[j] instead of kind 250 suggestions.add(new SuggestedWordInfo(new String(mOutputCodePoints, start, len), 251 score, kind, this /* sourceDict */, 252 mSpaceIndices[j] /* indexOfTouchPointOfSecondWord */, 253 mOutputAutoCommitFirstWordConfidence[0])); 254 } 255 } 256 return suggestions; 257 } 258 259 public boolean isValidDictionary() { 260 return mNativeDict != 0; 261 } 262 263 public int getFormatVersion() { 264 return getFormatVersionNative(mNativeDict); 265 } 266 267 public static float calcNormalizedScore(final String before, final String after, 268 final int score) { 269 return calcNormalizedScoreNative(StringUtils.toCodePointArray(before), 270 StringUtils.toCodePointArray(after), score); 271 } 272 273 public static int editDistance(final String before, final String after) { 274 if (before == null || after == null) { 275 throw new IllegalArgumentException(); 276 } 277 return editDistanceNative(StringUtils.toCodePointArray(before), 278 StringUtils.toCodePointArray(after)); 279 } 280 281 @Override 282 public boolean isValidWord(final String word) { 283 return getFrequency(word) != NOT_A_PROBABILITY; 284 } 285 286 @Override 287 public int getFrequency(final String word) { 288 if (word == null) return NOT_A_PROBABILITY; 289 int[] codePoints = StringUtils.toCodePointArray(word); 290 return getProbabilityNative(mNativeDict, codePoints); 291 } 292 293 // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni 294 // calls when checking for changes in an entire dictionary. 295 public boolean isValidBigram(final String word0, final String word1) { 296 return getBigramProbability(word0, word1) != NOT_A_PROBABILITY; 297 } 298 299 public int getBigramProbability(final String word0, final String word1) { 300 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) return NOT_A_PROBABILITY; 301 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 302 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 303 return getBigramProbabilityNative(mNativeDict, codePoints0, codePoints1); 304 } 305 306 @UsedForTesting 307 public UnigramProperty getUnigramProperty(final String word) { 308 if (TextUtils.isEmpty(word)) { 309 return null; 310 } 311 final int[] codePoints = StringUtils.toCodePointArray(word); 312 final int[] outCodePoints = new int[MAX_WORD_LENGTH]; 313 final boolean[] outFlags = new boolean[FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT]; 314 final int[] outProbability = new int[1]; 315 final int[] outHistoricalInfo = 316 new int[FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT]; 317 final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList(); 318 final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList(); 319 getUnigramPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbability, 320 outHistoricalInfo, outShortcutTargets, outShortcutProbabilities); 321 return new UnigramProperty(codePoints, 322 outFlags[FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX], 323 outFlags[FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX], 324 outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX], 325 outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX], outProbability[0], 326 outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX], 327 outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX], 328 outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX], 329 outShortcutTargets, outShortcutProbabilities); 330 } 331 332 // Add a unigram entry to binary dictionary with unigram attributes in native code. 333 public void addUnigramWord(final String word, final int probability, 334 final String shortcutTarget, final int shortcutProbability, final boolean isNotAWord, 335 final boolean isBlacklisted, final int timestamp) { 336 if (TextUtils.isEmpty(word)) { 337 return; 338 } 339 final int[] codePoints = StringUtils.toCodePointArray(word); 340 final int[] shortcutTargetCodePoints = (shortcutTarget != null) ? 341 StringUtils.toCodePointArray(shortcutTarget) : null; 342 addUnigramWordNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints, 343 shortcutProbability, isNotAWord, isBlacklisted, timestamp); 344 } 345 346 // Add a bigram entry to binary dictionary with timestamp in native code. 347 public void addBigramWords(final String word0, final String word1, final int probability, 348 final int timestamp) { 349 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) { 350 return; 351 } 352 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 353 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 354 addBigramWordsNative(mNativeDict, codePoints0, codePoints1, probability, timestamp); 355 } 356 357 // Remove a bigram entry form binary dictionary in native code. 358 public void removeBigramWords(final String word0, final String word1) { 359 if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) { 360 return; 361 } 362 final int[] codePoints0 = StringUtils.toCodePointArray(word0); 363 final int[] codePoints1 = StringUtils.toCodePointArray(word1); 364 removeBigramWordsNative(mNativeDict, codePoints0, codePoints1); 365 } 366 367 public static class LanguageModelParam { 368 public final int[] mWord0; 369 public final int[] mWord1; 370 // TODO: this needs to be a list of shortcuts 371 public final int[] mShortcutTarget; 372 public final int mUnigramProbability; 373 public final int mBigramProbability; 374 public final int mShortcutProbability; 375 public final boolean mIsNotAWord; 376 public final boolean mIsBlacklisted; 377 public final int mTimestamp; 378 379 // Constructor for unigram. TODO: support shortcuts 380 public LanguageModelParam(final String word, final int unigramProbability, 381 final int timestamp) { 382 mWord0 = null; 383 mWord1 = StringUtils.toCodePointArray(word); 384 mShortcutTarget = null; 385 mUnigramProbability = unigramProbability; 386 mBigramProbability = NOT_A_PROBABILITY; 387 mShortcutProbability = NOT_A_PROBABILITY; 388 mIsNotAWord = false; 389 mIsBlacklisted = false; 390 mTimestamp = timestamp; 391 } 392 393 // Constructor for unigram and bigram. 394 public LanguageModelParam(final String word0, final String word1, 395 final int unigramProbability, final int bigramProbability, 396 final int timestamp) { 397 mWord0 = StringUtils.toCodePointArray(word0); 398 mWord1 = StringUtils.toCodePointArray(word1); 399 mShortcutTarget = null; 400 mUnigramProbability = unigramProbability; 401 mBigramProbability = bigramProbability; 402 mShortcutProbability = NOT_A_PROBABILITY; 403 mIsNotAWord = false; 404 mIsBlacklisted = false; 405 mTimestamp = timestamp; 406 } 407 } 408 409 public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) { 410 if (!isValidDictionary()) return; 411 int processedParamCount = 0; 412 while (processedParamCount < languageModelParams.length) { 413 if (needsToRunGC(true /* mindsBlockByGC */)) { 414 flushWithGC(); 415 } 416 processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict, 417 languageModelParams, processedParamCount); 418 if (processedParamCount <= 0) { 419 return; 420 } 421 } 422 423 } 424 425 private void reopen() { 426 close(); 427 final File dictFile = new File(mDictFilePath); 428 // WARNING: Because we pass 0 as the offset and file.length() as the length, this can 429 // only be called for actual files. Right now it's only called by the flush() family of 430 // functions, which require an updatable dictionary, so it's okay. But beware. 431 loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */, 432 dictFile.length(), true /* isUpdatable */); 433 } 434 435 public void flush() { 436 if (!isValidDictionary()) return; 437 flushNative(mNativeDict, mDictFilePath); 438 reopen(); 439 } 440 441 public void flushWithGC() { 442 if (!isValidDictionary()) return; 443 flushWithGCNative(mNativeDict, mDictFilePath); 444 reopen(); 445 } 446 447 /** 448 * Checks whether GC is needed to run or not. 449 * @param mindsBlockByGC Whether to mind operations blocked by GC. We don't need to care about 450 * the blocking in some situations such as in idle time or just before closing. 451 * @return whether GC is needed to run or not. 452 */ 453 public boolean needsToRunGC(final boolean mindsBlockByGC) { 454 if (!isValidDictionary()) return false; 455 return needsToRunGCNative(mNativeDict, mindsBlockByGC); 456 } 457 458 @UsedForTesting 459 public int calculateProbability(final int unigramProbability, final int bigramProbability) { 460 if (!isValidDictionary()) return NOT_A_PROBABILITY; 461 return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability); 462 } 463 464 @UsedForTesting 465 public String getPropertyForTests(String query) { 466 if (!isValidDictionary()) return ""; 467 return getPropertyNative(mNativeDict, query); 468 } 469 470 @Override 471 public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { 472 return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT; 473 } 474 475 @Override 476 public void close() { 477 synchronized (mDicTraverseSessions) { 478 final int sessionsSize = mDicTraverseSessions.size(); 479 for (int index = 0; index < sessionsSize; ++index) { 480 final DicTraverseSession traverseSession = mDicTraverseSessions.valueAt(index); 481 if (traverseSession != null) { 482 traverseSession.close(); 483 } 484 } 485 mDicTraverseSessions.clear(); 486 } 487 closeInternalLocked(); 488 } 489 490 private synchronized void closeInternalLocked() { 491 if (mNativeDict != 0) { 492 closeNative(mNativeDict); 493 mNativeDict = 0; 494 } 495 } 496 497 // TODO: Manage BinaryDictionary instances without using WeakReference or something. 498 @Override 499 protected void finalize() throws Throwable { 500 try { 501 closeInternalLocked(); 502 } finally { 503 super.finalize(); 504 } 505 } 506} 507