1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.content.Context; 20import android.text.TextUtils; 21import android.util.Log; 22 23import com.android.inputmethod.keyboard.ProximityInfo; 24 25import java.io.File; 26import java.util.ArrayList; 27import java.util.Arrays; 28import java.util.HashMap; 29import java.util.HashSet; 30import java.util.Locale; 31import java.util.Map; 32import java.util.Set; 33 34/** 35 * This class loads a dictionary and provides a list of suggestions for a given sequence of 36 * characters. This includes corrections and completions. 37 */ 38public class Suggest implements Dictionary.WordCallback { 39 40 public static final String TAG = Suggest.class.getSimpleName(); 41 42 public static final int APPROX_MAX_WORD_LENGTH = 32; 43 44 public static final int CORRECTION_NONE = 0; 45 public static final int CORRECTION_BASIC = 1; 46 public static final int CORRECTION_FULL = 2; 47 public static final int CORRECTION_FULL_BIGRAM = 3; 48 49 /** 50 * Words that appear in both bigram and unigram data gets multiplier ranging from 51 * BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX depending on the score from 52 * bigram data. 53 */ 54 public static final double BIGRAM_MULTIPLIER_MIN = 1.2; 55 public static final double BIGRAM_MULTIPLIER_MAX = 1.5; 56 57 /** 58 * Maximum possible bigram frequency. Will depend on how many bits are being used in data 59 * structure. Maximum bigram frequency will get the BIGRAM_MULTIPLIER_MAX as the multiplier. 60 */ 61 public static final int MAXIMUM_BIGRAM_FREQUENCY = 127; 62 63 // It seems the following values are only used for logging. 64 public static final int DIC_USER_TYPED = 0; 65 public static final int DIC_MAIN = 1; 66 public static final int DIC_USER = 2; 67 public static final int DIC_USER_UNIGRAM = 3; 68 public static final int DIC_CONTACTS = 4; 69 public static final int DIC_USER_BIGRAM = 5; 70 public static final int DIC_WHITELIST = 6; 71 // If you add a type of dictionary, increment DIC_TYPE_LAST_ID 72 // TODO: this value seems unused. Remove it? 73 public static final int DIC_TYPE_LAST_ID = 6; 74 public static final String DICT_KEY_MAIN = "main"; 75 public static final String DICT_KEY_CONTACTS = "contacts"; 76 // User dictionary, the system-managed one. 77 public static final String DICT_KEY_USER = "user"; 78 // User unigram dictionary, internal to LatinIME 79 public static final String DICT_KEY_USER_UNIGRAM = "user_unigram"; 80 // User bigram dictionary, internal to LatinIME 81 public static final String DICT_KEY_USER_BIGRAM = "user_bigram"; 82 public static final String DICT_KEY_WHITELIST ="whitelist"; 83 84 private static final boolean DBG = LatinImeLogger.sDBG; 85 86 private AutoCorrection mAutoCorrection; 87 88 private Dictionary mMainDict; 89 private ContactsDictionary mContactsDict; 90 private WhitelistDictionary mWhiteListDictionary; 91 private final Map<String, Dictionary> mUnigramDictionaries = new HashMap<String, Dictionary>(); 92 private final Map<String, Dictionary> mBigramDictionaries = new HashMap<String, Dictionary>(); 93 94 private int mPrefMaxSuggestions = 18; 95 96 private static final int PREF_MAX_BIGRAMS = 60; 97 98 private double mAutoCorrectionThreshold; 99 private int[] mScores = new int[mPrefMaxSuggestions]; 100 private int[] mBigramScores = new int[PREF_MAX_BIGRAMS]; 101 102 private ArrayList<CharSequence> mSuggestions = new ArrayList<CharSequence>(); 103 ArrayList<CharSequence> mBigramSuggestions = new ArrayList<CharSequence>(); 104 private CharSequence mTypedWord; 105 106 // TODO: Remove these member variables by passing more context to addWord() callback method 107 private boolean mIsFirstCharCapitalized; 108 private boolean mIsAllUpperCase; 109 110 private int mCorrectionMode = CORRECTION_BASIC; 111 112 public Suggest(final Context context, final int dictionaryResId, final Locale locale) { 113 initAsynchronously(context, dictionaryResId, locale); 114 } 115 116 /* package for test */ Suggest(final Context context, final File dictionary, 117 final long startOffset, final long length, final Flag[] flagArray, 118 final Locale locale) { 119 initSynchronously(null, DictionaryFactory.createDictionaryForTest(context, dictionary, 120 startOffset, length, flagArray), locale); 121 } 122 123 private void initWhitelistAndAutocorrectAndPool(final Context context, final Locale locale) { 124 mWhiteListDictionary = new WhitelistDictionary(context, locale); 125 addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_WHITELIST, mWhiteListDictionary); 126 mAutoCorrection = new AutoCorrection(); 127 StringBuilderPool.ensureCapacity(mPrefMaxSuggestions, getApproxMaxWordLength()); 128 } 129 130 private void initAsynchronously(final Context context, final int dictionaryResId, 131 final Locale locale) { 132 resetMainDict(context, dictionaryResId, locale); 133 134 // TODO: read the whitelist and init the pool asynchronously too. 135 // initPool should be done asynchronously now that the pool is thread-safe. 136 initWhitelistAndAutocorrectAndPool(context, locale); 137 } 138 139 private void initSynchronously(final Context context, final Dictionary mainDict, 140 final Locale locale) { 141 mMainDict = mainDict; 142 addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_MAIN, mainDict); 143 addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_MAIN, mainDict); 144 initWhitelistAndAutocorrectAndPool(context, locale); 145 } 146 147 private void addOrReplaceDictionary(Map<String, Dictionary> dictionaries, String key, 148 Dictionary dict) { 149 final Dictionary oldDict = (dict == null) 150 ? dictionaries.remove(key) 151 : dictionaries.put(key, dict); 152 if (oldDict != null && dict != oldDict) { 153 oldDict.close(); 154 } 155 } 156 157 public void resetMainDict(final Context context, final int dictionaryResId, 158 final Locale locale) { 159 mMainDict = null; 160 new Thread("InitializeBinaryDictionary") { 161 @Override 162 public void run() { 163 final Dictionary newMainDict = DictionaryFactory.createDictionaryFromManager( 164 context, locale, dictionaryResId); 165 mMainDict = newMainDict; 166 addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_MAIN, newMainDict); 167 addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_MAIN, newMainDict); 168 } 169 }.start(); 170 } 171 172 public int getCorrectionMode() { 173 return mCorrectionMode; 174 } 175 176 public void setCorrectionMode(int mode) { 177 mCorrectionMode = mode; 178 } 179 180 // The main dictionary could have been loaded asynchronously. Don't cache the return value 181 // of this method. 182 public boolean hasMainDictionary() { 183 return mMainDict != null; 184 } 185 186 public ContactsDictionary getContactsDictionary() { 187 return mContactsDict; 188 } 189 190 public Map<String, Dictionary> getUnigramDictionaries() { 191 return mUnigramDictionaries; 192 } 193 194 public int getApproxMaxWordLength() { 195 return APPROX_MAX_WORD_LENGTH; 196 } 197 198 /** 199 * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted 200 * before the main dictionary, if set. This refers to the system-managed user dictionary. 201 */ 202 public void setUserDictionary(Dictionary userDictionary) { 203 addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_USER, userDictionary); 204 } 205 206 /** 207 * Sets an optional contacts dictionary resource to be loaded. It is also possible to remove 208 * the contacts dictionary by passing null to this method. In this case no contacts dictionary 209 * won't be used. 210 */ 211 public void setContactsDictionary(ContactsDictionary contactsDictionary) { 212 mContactsDict = contactsDictionary; 213 addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_CONTACTS, contactsDictionary); 214 addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_CONTACTS, contactsDictionary); 215 } 216 217 public void setUserUnigramDictionary(Dictionary userUnigramDictionary) { 218 addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_USER_UNIGRAM, userUnigramDictionary); 219 } 220 221 public void setUserBigramDictionary(Dictionary userBigramDictionary) { 222 addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_USER_BIGRAM, userBigramDictionary); 223 } 224 225 public void setAutoCorrectionThreshold(double threshold) { 226 mAutoCorrectionThreshold = threshold; 227 } 228 229 public boolean isAggressiveAutoCorrectionMode() { 230 return (mAutoCorrectionThreshold == 0); 231 } 232 233 /** 234 * Number of suggestions to generate from the input key sequence. This has 235 * to be a number between 1 and 100 (inclusive). 236 * @param maxSuggestions 237 * @throws IllegalArgumentException if the number is out of range 238 */ 239 public void setMaxSuggestions(int maxSuggestions) { 240 if (maxSuggestions < 1 || maxSuggestions > 100) { 241 throw new IllegalArgumentException("maxSuggestions must be between 1 and 100"); 242 } 243 mPrefMaxSuggestions = maxSuggestions; 244 mScores = new int[mPrefMaxSuggestions]; 245 mBigramScores = new int[PREF_MAX_BIGRAMS]; 246 collectGarbage(mSuggestions, mPrefMaxSuggestions); 247 StringBuilderPool.ensureCapacity(mPrefMaxSuggestions, getApproxMaxWordLength()); 248 } 249 250 /** 251 * Returns a object which represents suggested words that match the list of character codes 252 * passed in. This object contents will be overwritten the next time this function is called. 253 * @param wordComposer contains what is currently being typed 254 * @param prevWordForBigram previous word (used only for bigram) 255 * @return suggested words object. 256 */ 257 public SuggestedWords getSuggestions(final WordComposer wordComposer, 258 final CharSequence prevWordForBigram, final ProximityInfo proximityInfo) { 259 return getSuggestedWordBuilder(wordComposer, prevWordForBigram, 260 proximityInfo).build(); 261 } 262 263 private CharSequence capitalizeWord(boolean all, boolean first, CharSequence word) { 264 if (TextUtils.isEmpty(word) || !(all || first)) return word; 265 final int wordLength = word.length(); 266 final StringBuilder sb = StringBuilderPool.getStringBuilder(getApproxMaxWordLength()); 267 // TODO: Must pay attention to locale when changing case. 268 if (all) { 269 sb.append(word.toString().toUpperCase()); 270 } else if (first) { 271 sb.append(Character.toUpperCase(word.charAt(0))); 272 if (wordLength > 1) { 273 sb.append(word.subSequence(1, wordLength)); 274 } 275 } 276 return sb; 277 } 278 279 protected void addBigramToSuggestions(CharSequence bigram) { 280 // TODO: Try to be a little more shrewd with resource allocation. 281 // At the moment we copy this object because the StringBuilders are pooled (see 282 // StringBuilderPool.java) and when we are finished using mSuggestions and 283 // mBigramSuggestions we will take everything from both and insert them back in the 284 // pool, so we can't allow the same object to be in both lists at the same time. 285 final StringBuilder sb = StringBuilderPool.getStringBuilder(getApproxMaxWordLength()); 286 sb.append(bigram); 287 mSuggestions.add(sb); 288 } 289 290 // TODO: cleanup dictionaries looking up and suggestions building with SuggestedWords.Builder 291 public SuggestedWords.Builder getSuggestedWordBuilder( 292 final WordComposer wordComposer, CharSequence prevWordForBigram, 293 final ProximityInfo proximityInfo) { 294 LatinImeLogger.onStartSuggestion(prevWordForBigram); 295 mAutoCorrection.init(); 296 mIsFirstCharCapitalized = wordComposer.isFirstCharCapitalized(); 297 mIsAllUpperCase = wordComposer.isAllUpperCase(); 298 collectGarbage(mSuggestions, mPrefMaxSuggestions); 299 Arrays.fill(mScores, 0); 300 301 // Save a lowercase version of the original word 302 String typedWord = wordComposer.getTypedWord(); 303 if (typedWord != null) { 304 // Treating USER_TYPED as UNIGRAM suggestion for logging now. 305 LatinImeLogger.onAddSuggestedWord(typedWord, Suggest.DIC_USER_TYPED, 306 Dictionary.DataType.UNIGRAM); 307 } 308 mTypedWord = typedWord; 309 310 if (wordComposer.size() <= 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM 311 || mCorrectionMode == CORRECTION_BASIC)) { 312 // At first character typed, search only the bigrams 313 Arrays.fill(mBigramScores, 0); 314 collectGarbage(mBigramSuggestions, PREF_MAX_BIGRAMS); 315 316 if (!TextUtils.isEmpty(prevWordForBigram)) { 317 CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase(); 318 if (mMainDict != null && mMainDict.isValidWord(lowerPrevWord)) { 319 prevWordForBigram = lowerPrevWord; 320 } 321 for (final Dictionary dictionary : mBigramDictionaries.values()) { 322 dictionary.getBigrams(wordComposer, prevWordForBigram, this); 323 } 324 if (TextUtils.isEmpty(typedWord)) { 325 // Nothing entered: return all bigrams for the previous word 326 int insertCount = Math.min(mBigramSuggestions.size(), mPrefMaxSuggestions); 327 for (int i = 0; i < insertCount; ++i) { 328 addBigramToSuggestions(mBigramSuggestions.get(i)); 329 } 330 } else { 331 // Word entered: return only bigrams that match the first char of the typed word 332 @SuppressWarnings("null") 333 final char currentChar = typedWord.charAt(0); 334 // TODO: Must pay attention to locale when changing case. 335 final char currentCharUpper = Character.toUpperCase(currentChar); 336 int count = 0; 337 final int bigramSuggestionSize = mBigramSuggestions.size(); 338 for (int i = 0; i < bigramSuggestionSize; i++) { 339 final CharSequence bigramSuggestion = mBigramSuggestions.get(i); 340 final char bigramSuggestionFirstChar = bigramSuggestion.charAt(0); 341 if (bigramSuggestionFirstChar == currentChar 342 || bigramSuggestionFirstChar == currentCharUpper) { 343 addBigramToSuggestions(bigramSuggestion); 344 if (++count > mPrefMaxSuggestions) break; 345 } 346 } 347 } 348 } 349 350 } else if (wordComposer.size() > 1) { 351 // At second character typed, search the unigrams (scores being affected by bigrams) 352 for (final String key : mUnigramDictionaries.keySet()) { 353 // Skip UserUnigramDictionary and WhitelistDictionary to lookup 354 if (key.equals(DICT_KEY_USER_UNIGRAM) || key.equals(DICT_KEY_WHITELIST)) 355 continue; 356 final Dictionary dictionary = mUnigramDictionaries.get(key); 357 dictionary.getWords(wordComposer, this, proximityInfo); 358 } 359 } 360 final String typedWordString = typedWord == null ? null : typedWord.toString(); 361 362 CharSequence whitelistedWord = capitalizeWord(mIsAllUpperCase, mIsFirstCharCapitalized, 363 mWhiteListDictionary.getWhitelistedWord(typedWordString)); 364 365 mAutoCorrection.updateAutoCorrectionStatus(mUnigramDictionaries, wordComposer, 366 mSuggestions, mScores, typedWord, mAutoCorrectionThreshold, mCorrectionMode, 367 whitelistedWord); 368 369 if (whitelistedWord != null) { 370 mSuggestions.add(0, whitelistedWord); 371 } 372 373 if (typedWord != null) { 374 mSuggestions.add(0, typedWordString); 375 } 376 Utils.removeDupes(mSuggestions); 377 378 if (DBG) { 379 double normalizedScore = mAutoCorrection.getNormalizedScore(); 380 ArrayList<SuggestedWords.SuggestedWordInfo> scoreInfoList = 381 new ArrayList<SuggestedWords.SuggestedWordInfo>(); 382 scoreInfoList.add(new SuggestedWords.SuggestedWordInfo("+", false)); 383 for (int i = 0; i < mScores.length; ++i) { 384 if (normalizedScore > 0) { 385 final String scoreThreshold = String.format("%d (%4.2f)", mScores[i], 386 normalizedScore); 387 scoreInfoList.add( 388 new SuggestedWords.SuggestedWordInfo(scoreThreshold, false)); 389 normalizedScore = 0.0; 390 } else { 391 final String score = Integer.toString(mScores[i]); 392 scoreInfoList.add(new SuggestedWords.SuggestedWordInfo(score, false)); 393 } 394 } 395 for (int i = mScores.length; i < mSuggestions.size(); ++i) { 396 scoreInfoList.add(new SuggestedWords.SuggestedWordInfo("--", false)); 397 } 398 return new SuggestedWords.Builder().addWords(mSuggestions, scoreInfoList); 399 } 400 return new SuggestedWords.Builder().addWords(mSuggestions, null); 401 } 402 403 public boolean hasAutoCorrection() { 404 return mAutoCorrection.hasAutoCorrection(); 405 } 406 407 @Override 408 public boolean addWord(final char[] word, final int offset, final int length, int score, 409 final int dicTypeId, final Dictionary.DataType dataType) { 410 Dictionary.DataType dataTypeForLog = dataType; 411 final ArrayList<CharSequence> suggestions; 412 final int[] sortedScores; 413 final int prefMaxSuggestions; 414 if(dataType == Dictionary.DataType.BIGRAM) { 415 suggestions = mBigramSuggestions; 416 sortedScores = mBigramScores; 417 prefMaxSuggestions = PREF_MAX_BIGRAMS; 418 } else { 419 suggestions = mSuggestions; 420 sortedScores = mScores; 421 prefMaxSuggestions = mPrefMaxSuggestions; 422 } 423 424 int pos = 0; 425 426 // Check if it's the same word, only caps are different 427 if (Utils.equalsIgnoreCase(mTypedWord, word, offset, length)) { 428 // TODO: remove this surrounding if clause and move this logic to 429 // getSuggestedWordBuilder. 430 if (suggestions.size() > 0) { 431 final String currentHighestWord = suggestions.get(0).toString(); 432 // If the current highest word is also equal to typed word, we need to compare 433 // frequency to determine the insertion position. This does not ensure strictly 434 // correct ordering, but ensures the top score is on top which is enough for 435 // removing duplicates correctly. 436 if (Utils.equalsIgnoreCase(currentHighestWord, word, offset, length) 437 && score <= sortedScores[0]) { 438 pos = 1; 439 } 440 } 441 } else { 442 if (dataType == Dictionary.DataType.UNIGRAM) { 443 // Check if the word was already added before (by bigram data) 444 int bigramSuggestion = searchBigramSuggestion(word,offset,length); 445 if(bigramSuggestion >= 0) { 446 dataTypeForLog = Dictionary.DataType.BIGRAM; 447 // turn freq from bigram into multiplier specified above 448 double multiplier = (((double) mBigramScores[bigramSuggestion]) 449 / MAXIMUM_BIGRAM_FREQUENCY) 450 * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN) 451 + BIGRAM_MULTIPLIER_MIN; 452 /* Log.d(TAG,"bigram num: " + bigramSuggestion 453 + " wordB: " + mBigramSuggestions.get(bigramSuggestion).toString() 454 + " currentScore: " + score + " bigramScore: " 455 + mBigramScores[bigramSuggestion] 456 + " multiplier: " + multiplier); */ 457 score = (int)Math.round((score * multiplier)); 458 } 459 } 460 461 // Check the last one's score and bail 462 if (sortedScores[prefMaxSuggestions - 1] >= score) return true; 463 while (pos < prefMaxSuggestions) { 464 if (sortedScores[pos] < score 465 || (sortedScores[pos] == score && length < suggestions.get(pos).length())) { 466 break; 467 } 468 pos++; 469 } 470 } 471 if (pos >= prefMaxSuggestions) { 472 return true; 473 } 474 475 System.arraycopy(sortedScores, pos, sortedScores, pos + 1, prefMaxSuggestions - pos - 1); 476 sortedScores[pos] = score; 477 final StringBuilder sb = StringBuilderPool.getStringBuilder(getApproxMaxWordLength()); 478 // TODO: Must pay attention to locale when changing case. 479 if (mIsAllUpperCase) { 480 sb.append(new String(word, offset, length).toUpperCase()); 481 } else if (mIsFirstCharCapitalized) { 482 sb.append(Character.toUpperCase(word[offset])); 483 if (length > 1) { 484 sb.append(word, offset + 1, length - 1); 485 } 486 } else { 487 sb.append(word, offset, length); 488 } 489 suggestions.add(pos, sb); 490 if (suggestions.size() > prefMaxSuggestions) { 491 final CharSequence garbage = suggestions.remove(prefMaxSuggestions); 492 if (garbage instanceof StringBuilder) { 493 StringBuilderPool.recycle((StringBuilder)garbage); 494 } 495 } else { 496 LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog); 497 } 498 return true; 499 } 500 501 private int searchBigramSuggestion(final char[] word, final int offset, final int length) { 502 // TODO This is almost O(n^2). Might need fix. 503 // search whether the word appeared in bigram data 504 int bigramSuggestSize = mBigramSuggestions.size(); 505 for(int i = 0; i < bigramSuggestSize; i++) { 506 if(mBigramSuggestions.get(i).length() == length) { 507 boolean chk = true; 508 for(int j = 0; j < length; j++) { 509 if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) { 510 chk = false; 511 break; 512 } 513 } 514 if(chk) return i; 515 } 516 } 517 518 return -1; 519 } 520 521 private void collectGarbage(ArrayList<CharSequence> suggestions, int prefMaxSuggestions) { 522 int poolSize = StringBuilderPool.getSize(); 523 int garbageSize = suggestions.size(); 524 while (poolSize < prefMaxSuggestions && garbageSize > 0) { 525 final CharSequence garbage = suggestions.get(garbageSize - 1); 526 if (garbage instanceof StringBuilder) { 527 StringBuilderPool.recycle((StringBuilder)garbage); 528 poolSize++; 529 } 530 garbageSize--; 531 } 532 if (poolSize == prefMaxSuggestions + 1) { 533 Log.w("Suggest", "String pool got too big: " + poolSize); 534 } 535 suggestions.clear(); 536 } 537 538 public void close() { 539 final Set<Dictionary> dictionaries = new HashSet<Dictionary>(); 540 dictionaries.addAll(mUnigramDictionaries.values()); 541 dictionaries.addAll(mBigramDictionaries.values()); 542 for (final Dictionary dictionary : dictionaries) { 543 dictionary.close(); 544 } 545 mMainDict = null; 546 } 547} 548