Suggest.java revision 8553b5ec315660ab53dd9234e64e1e39ea09ec0f
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.content.Context; 20import android.text.AutoText; 21import android.text.TextUtils; 22import android.util.Log; 23import android.view.View; 24 25import java.util.ArrayList; 26import java.util.Arrays; 27 28/** 29 * This class loads a dictionary and provides a list of suggestions for a given sequence of 30 * characters. This includes corrections and completions. 31 */ 32public class Suggest implements Dictionary.WordCallback { 33 34 public static final String TAG = Suggest.class.getSimpleName(); 35 36 public static final int APPROX_MAX_WORD_LENGTH = 32; 37 38 public static final int CORRECTION_NONE = 0; 39 public static final int CORRECTION_BASIC = 1; 40 public static final int CORRECTION_FULL = 2; 41 public static final int CORRECTION_FULL_BIGRAM = 3; 42 43 /** 44 * Words that appear in both bigram and unigram data gets multiplier ranging from 45 * BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX depending on the frequency score from 46 * bigram data. 47 */ 48 public static final double BIGRAM_MULTIPLIER_MIN = 1.2; 49 public static final double BIGRAM_MULTIPLIER_MAX = 1.5; 50 51 /** 52 * Maximum possible bigram frequency. Will depend on how many bits are being used in data 53 * structure. Maximum bigram freqeuncy will get the BIGRAM_MULTIPLIER_MAX as the multiplier. 54 */ 55 public static final int MAXIMUM_BIGRAM_FREQUENCY = 127; 56 57 public static final int DIC_USER_TYPED = 0; 58 public static final int DIC_MAIN = 1; 59 public static final int DIC_USER = 2; 60 public static final int DIC_AUTO = 3; 61 public static final int DIC_CONTACTS = 4; 62 // If you add a type of dictionary, increment DIC_TYPE_LAST_ID 63 public static final int DIC_TYPE_LAST_ID = 4; 64 65 static final int LARGE_DICTIONARY_THRESHOLD = 200 * 1000; 66 67 private static final boolean DBG = LatinImeLogger.sDBG; 68 69 private BinaryDictionary mMainDict; 70 71 private Dictionary mUserDictionary; 72 73 private Dictionary mAutoDictionary; 74 75 private Dictionary mContactsDictionary; 76 77 private Dictionary mUserBigramDictionary; 78 79 private int mPrefMaxSuggestions = 12; 80 81 private static final int PREF_MAX_BIGRAMS = 60; 82 83 private boolean mAutoTextEnabled; 84 85 private double mAutoCorrectionThreshold; 86 private int[] mPriorities = new int[mPrefMaxSuggestions]; 87 private int[] mBigramPriorities = new int[PREF_MAX_BIGRAMS]; 88 89 // Handle predictive correction for only the first 1280 characters for performance reasons 90 // If we support scripts that need latin characters beyond that, we should probably use some 91 // kind of a sparse array or language specific list with a mapping lookup table. 92 // 1280 is the size of the BASE_CHARS array in ExpandableDictionary, which is a basic set of 93 // latin characters. 94 private int[] mNextLettersFrequencies = new int[1280]; 95 private ArrayList<CharSequence> mSuggestions = new ArrayList<CharSequence>(); 96 ArrayList<CharSequence> mBigramSuggestions = new ArrayList<CharSequence>(); 97 private ArrayList<CharSequence> mStringPool = new ArrayList<CharSequence>(); 98 private boolean mHasAutoCorrection; 99 private String mLowerOriginalWord; 100 101 // TODO: Remove these member variables by passing more context to addWord() callback method 102 private boolean mIsFirstCharCapitalized; 103 private boolean mIsAllUpperCase; 104 105 private int mCorrectionMode = CORRECTION_BASIC; 106 107 public Suggest(Context context, int dictionaryResId) { 108 mMainDict = BinaryDictionary.initDictionary(context, dictionaryResId, DIC_MAIN); 109 initPool(); 110 } 111 112 private void initPool() { 113 for (int i = 0; i < mPrefMaxSuggestions; i++) { 114 StringBuilder sb = new StringBuilder(getApproxMaxWordLength()); 115 mStringPool.add(sb); 116 } 117 } 118 119 public void setAutoTextEnabled(boolean enabled) { 120 mAutoTextEnabled = enabled; 121 } 122 123 public int getCorrectionMode() { 124 return mCorrectionMode; 125 } 126 127 public void setCorrectionMode(int mode) { 128 mCorrectionMode = mode; 129 } 130 131 public boolean hasMainDictionary() { 132 return mMainDict != null && mMainDict.getSize() > LARGE_DICTIONARY_THRESHOLD; 133 } 134 135 public int getApproxMaxWordLength() { 136 return APPROX_MAX_WORD_LENGTH; 137 } 138 139 /** 140 * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted 141 * before the main dictionary, if set. 142 */ 143 public void setUserDictionary(Dictionary userDictionary) { 144 mUserDictionary = userDictionary; 145 } 146 147 /** 148 * Sets an optional contacts dictionary resource to be loaded. 149 */ 150 public void setContactsDictionary(Dictionary userDictionary) { 151 mContactsDictionary = userDictionary; 152 } 153 154 public void setAutoDictionary(Dictionary autoDictionary) { 155 mAutoDictionary = autoDictionary; 156 } 157 158 public void setUserBigramDictionary(Dictionary userBigramDictionary) { 159 mUserBigramDictionary = userBigramDictionary; 160 } 161 162 public void setAutoCorrectionThreshold(double threshold) { 163 mAutoCorrectionThreshold = threshold; 164 } 165 166 public boolean isAggressiveAutoCorrectionMode() { 167 return (mAutoCorrectionThreshold == 0); 168 } 169 170 /** 171 * Number of suggestions to generate from the input key sequence. This has 172 * to be a number between 1 and 100 (inclusive). 173 * @param maxSuggestions 174 * @throws IllegalArgumentException if the number is out of range 175 */ 176 public void setMaxSuggestions(int maxSuggestions) { 177 if (maxSuggestions < 1 || maxSuggestions > 100) { 178 throw new IllegalArgumentException("maxSuggestions must be between 1 and 100"); 179 } 180 mPrefMaxSuggestions = maxSuggestions; 181 mPriorities = new int[mPrefMaxSuggestions]; 182 mBigramPriorities = new int[PREF_MAX_BIGRAMS]; 183 collectGarbage(mSuggestions, mPrefMaxSuggestions); 184 while (mStringPool.size() < mPrefMaxSuggestions) { 185 StringBuilder sb = new StringBuilder(getApproxMaxWordLength()); 186 mStringPool.add(sb); 187 } 188 } 189 190 /** 191 * Returns a object which represents suggested words that match the list of character codes 192 * passed in. This object contents will be overwritten the next time this function is called. 193 * @param view a view for retrieving the context for AutoText 194 * @param wordComposer contains what is currently being typed 195 * @param prevWordForBigram previous word (used only for bigram) 196 * @return suggested words object. 197 */ 198 public SuggestedWords getSuggestions(View view, WordComposer wordComposer, 199 CharSequence prevWordForBigram) { 200 return getSuggestedWordBuilder(view, wordComposer, prevWordForBigram).build(); 201 } 202 203 // TODO: cleanup dictionaries looking up and suggestions building with SuggestedWords.Builder 204 public SuggestedWords.Builder getSuggestedWordBuilder(View view, WordComposer wordComposer, 205 CharSequence prevWordForBigram) { 206 LatinImeLogger.onStartSuggestion(prevWordForBigram); 207 mHasAutoCorrection = false; 208 mIsFirstCharCapitalized = wordComposer.isFirstCharCapitalized(); 209 mIsAllUpperCase = wordComposer.isAllUpperCase(); 210 collectGarbage(mSuggestions, mPrefMaxSuggestions); 211 Arrays.fill(mPriorities, 0); 212 Arrays.fill(mNextLettersFrequencies, 0); 213 214 // Save a lowercase version of the original word 215 CharSequence typedWord = wordComposer.getTypedWord(); 216 if (typedWord != null) { 217 final String typedWordString = typedWord.toString(); 218 typedWord = typedWordString; 219 mLowerOriginalWord = typedWordString.toLowerCase(); 220 // Treating USER_TYPED as UNIGRAM suggestion for logging now. 221 LatinImeLogger.onAddSuggestedWord(typedWordString, Suggest.DIC_USER_TYPED, 222 Dictionary.DataType.UNIGRAM); 223 } else { 224 mLowerOriginalWord = ""; 225 } 226 227 double normalizedScore = Integer.MIN_VALUE; 228 if (wordComposer.size() == 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM 229 || mCorrectionMode == CORRECTION_BASIC)) { 230 // At first character typed, search only the bigrams 231 Arrays.fill(mBigramPriorities, 0); 232 collectGarbage(mBigramSuggestions, PREF_MAX_BIGRAMS); 233 234 if (!TextUtils.isEmpty(prevWordForBigram)) { 235 CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase(); 236 if (mMainDict != null && mMainDict.isValidWord(lowerPrevWord)) { 237 prevWordForBigram = lowerPrevWord; 238 } 239 if (mUserBigramDictionary != null) { 240 mUserBigramDictionary.getBigrams(wordComposer, prevWordForBigram, this, 241 mNextLettersFrequencies); 242 } 243 if (mContactsDictionary != null) { 244 mContactsDictionary.getBigrams(wordComposer, prevWordForBigram, this, 245 mNextLettersFrequencies); 246 } 247 if (mMainDict != null) { 248 mMainDict.getBigrams(wordComposer, prevWordForBigram, this, 249 mNextLettersFrequencies); 250 } 251 char currentChar = wordComposer.getTypedWord().charAt(0); 252 char currentCharUpper = Character.toUpperCase(currentChar); 253 int count = 0; 254 int bigramSuggestionSize = mBigramSuggestions.size(); 255 for (int i = 0; i < bigramSuggestionSize; i++) { 256 if (mBigramSuggestions.get(i).charAt(0) == currentChar 257 || mBigramSuggestions.get(i).charAt(0) == currentCharUpper) { 258 int poolSize = mStringPool.size(); 259 StringBuilder sb = poolSize > 0 ? 260 (StringBuilder) mStringPool.remove(poolSize - 1) 261 : new StringBuilder(getApproxMaxWordLength()); 262 sb.setLength(0); 263 sb.append(mBigramSuggestions.get(i)); 264 mSuggestions.add(count++, sb); 265 if (count > mPrefMaxSuggestions) break; 266 } 267 } 268 } 269 270 } else if (wordComposer.size() > 1) { 271 // At second character typed, search the unigrams (scores being affected by bigrams) 272 if (mUserDictionary != null || mContactsDictionary != null) { 273 if (mUserDictionary != null) { 274 mUserDictionary.getWords(wordComposer, this, mNextLettersFrequencies); 275 } 276 if (mContactsDictionary != null) { 277 mContactsDictionary.getWords(wordComposer, this, mNextLettersFrequencies); 278 } 279 280 if (mSuggestions.size() > 0 && isValidWord(typedWord) 281 && (mCorrectionMode == CORRECTION_FULL 282 || mCorrectionMode == CORRECTION_FULL_BIGRAM)) { 283 if (DBG) { 284 Log.d(TAG, "Auto corrected by CORRECTION_FULL."); 285 } 286 mHasAutoCorrection = true; 287 } 288 } 289 if (mMainDict != null) mMainDict.getWords(wordComposer, this, mNextLettersFrequencies); 290 if ((mCorrectionMode == CORRECTION_FULL || mCorrectionMode == CORRECTION_FULL_BIGRAM) 291 && mSuggestions.size() > 0 && mPriorities.length > 0) { 292 // TODO: when the normalized score of the first suggestion is nearly equals to 293 // the normalized score of the second suggestion, behave less aggressive. 294 normalizedScore = Utils.calcNormalizedScore( 295 typedWord, mSuggestions.get(0), mPriorities[0]); 296 if (DBG) { 297 Log.d(TAG, "Normalized " + typedWord + "," + mSuggestions.get(0) + "," 298 + mPriorities[0] + ", " + normalizedScore 299 + "(" + mAutoCorrectionThreshold + ")"); 300 } 301 if (normalizedScore >= mAutoCorrectionThreshold) { 302 if (DBG) { 303 Log.d(TAG, "Auto corrected by S-threthhold."); 304 } 305 mHasAutoCorrection = true; 306 } 307 } 308 } 309 if (typedWord != null) { 310 mSuggestions.add(0, typedWord.toString()); 311 } 312 if (mAutoTextEnabled) { 313 int i = 0; 314 int max = 6; 315 // Don't autotext the suggestions from the dictionaries 316 if (mCorrectionMode == CORRECTION_BASIC) max = 1; 317 while (i < mSuggestions.size() && i < max) { 318 String suggestedWord = mSuggestions.get(i).toString().toLowerCase(); 319 CharSequence autoText = 320 AutoText.get(suggestedWord, 0, suggestedWord.length(), view); 321 // Is there an AutoText (also known as Quick Fixes) correction? 322 boolean canAdd = autoText != null; 323 // Capitalize as needed 324 final int autoTextLength = autoText != null ? autoText.length() : 0; 325 if (autoTextLength > 0 && (mIsAllUpperCase || mIsFirstCharCapitalized)) { 326 int poolSize = mStringPool.size(); 327 StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove( 328 poolSize - 1) : new StringBuilder(getApproxMaxWordLength()); 329 sb.setLength(0); 330 if (mIsAllUpperCase) { 331 sb.append(autoText.toString().toUpperCase()); 332 } else if (mIsFirstCharCapitalized) { 333 sb.append(Character.toUpperCase(autoText.charAt(0))); 334 if (autoTextLength > 1) { 335 sb.append(autoText.subSequence(1, autoTextLength)); 336 } 337 } 338 autoText = sb.toString(); 339 } 340 // Is that correction already the current prediction (or original word)? 341 canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i)); 342 // Is that correction already the next predicted word? 343 if (canAdd && i + 1 < mSuggestions.size() && mCorrectionMode != CORRECTION_BASIC) { 344 canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i + 1)); 345 } 346 if (canAdd) { 347 if (DBG) { 348 Log.d(TAG, "Auto corrected by AUTOTEXT."); 349 } 350 mHasAutoCorrection = true; 351 mSuggestions.add(i + 1, autoText); 352 i++; 353 } 354 i++; 355 } 356 } 357 removeDupes(); 358 if (DBG) { 359 ArrayList<SuggestedWords.SuggestedWordInfo> frequencyInfoList = 360 new ArrayList<SuggestedWords.SuggestedWordInfo>(); 361 frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo("+", false)); 362 final int priorityLength = mPriorities.length; 363 for (int i = 0; i < priorityLength; ++i) { 364 if (normalizedScore > 0) { 365 final String priorityThreshold = Integer.toString(mPriorities[i]) + " (" + 366 normalizedScore + ")"; 367 frequencyInfoList.add( 368 new SuggestedWords.SuggestedWordInfo(priorityThreshold, false)); 369 normalizedScore = 0.0; 370 } else { 371 final String priority = Integer.toString(mPriorities[i]); 372 frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo(priority, false)); 373 } 374 } 375 for (int i = priorityLength; i < mSuggestions.size(); ++i) { 376 frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo("--", false)); 377 } 378 return new SuggestedWords.Builder().addWords(mSuggestions, frequencyInfoList); 379 } else { 380 return new SuggestedWords.Builder().addWords(mSuggestions, null); 381 } 382 } 383 384 public int[] getNextLettersFrequencies() { 385 return mNextLettersFrequencies; 386 } 387 388 private void removeDupes() { 389 final ArrayList<CharSequence> suggestions = mSuggestions; 390 if (suggestions.size() < 2) return; 391 int i = 1; 392 // Don't cache suggestions.size(), since we may be removing items 393 while (i < suggestions.size()) { 394 final CharSequence cur = suggestions.get(i); 395 // Compare each candidate with each previous candidate 396 for (int j = 0; j < i; j++) { 397 CharSequence previous = suggestions.get(j); 398 if (TextUtils.equals(cur, previous)) { 399 removeFromSuggestions(i); 400 i--; 401 break; 402 } 403 } 404 i++; 405 } 406 } 407 408 private void removeFromSuggestions(int index) { 409 CharSequence garbage = mSuggestions.remove(index); 410 if (garbage != null && garbage instanceof StringBuilder) { 411 mStringPool.add(garbage); 412 } 413 } 414 415 public boolean hasAutoCorrection() { 416 return mHasAutoCorrection; 417 } 418 419 private boolean compareCaseInsensitive(final String mLowerOriginalWord, 420 final char[] word, final int offset, final int length) { 421 final int originalLength = mLowerOriginalWord.length(); 422 if (originalLength == length && Character.isUpperCase(word[offset])) { 423 for (int i = 0; i < originalLength; i++) { 424 if (mLowerOriginalWord.charAt(i) != Character.toLowerCase(word[offset+i])) { 425 return false; 426 } 427 } 428 return true; 429 } 430 return false; 431 } 432 433 @Override 434 public boolean addWord(final char[] word, final int offset, final int length, int freq, 435 final int dicTypeId, final Dictionary.DataType dataType) { 436 Dictionary.DataType dataTypeForLog = dataType; 437 ArrayList<CharSequence> suggestions; 438 int[] priorities; 439 int prefMaxSuggestions; 440 if(dataType == Dictionary.DataType.BIGRAM) { 441 suggestions = mBigramSuggestions; 442 priorities = mBigramPriorities; 443 prefMaxSuggestions = PREF_MAX_BIGRAMS; 444 } else { 445 suggestions = mSuggestions; 446 priorities = mPriorities; 447 prefMaxSuggestions = mPrefMaxSuggestions; 448 } 449 450 int pos = 0; 451 452 // Check if it's the same word, only caps are different 453 if (compareCaseInsensitive(mLowerOriginalWord, word, offset, length)) { 454 pos = 0; 455 } else { 456 if (dataType == Dictionary.DataType.UNIGRAM) { 457 // Check if the word was already added before (by bigram data) 458 int bigramSuggestion = searchBigramSuggestion(word,offset,length); 459 if(bigramSuggestion >= 0) { 460 dataTypeForLog = Dictionary.DataType.BIGRAM; 461 // turn freq from bigram into multiplier specified above 462 double multiplier = (((double) mBigramPriorities[bigramSuggestion]) 463 / MAXIMUM_BIGRAM_FREQUENCY) 464 * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN) 465 + BIGRAM_MULTIPLIER_MIN; 466 /* Log.d(TAG,"bigram num: " + bigramSuggestion 467 + " wordB: " + mBigramSuggestions.get(bigramSuggestion).toString() 468 + " currentPriority: " + freq + " bigramPriority: " 469 + mBigramPriorities[bigramSuggestion] 470 + " multiplier: " + multiplier); */ 471 freq = (int)Math.round((freq * multiplier)); 472 } 473 } 474 475 // Check the last one's priority and bail 476 if (priorities[prefMaxSuggestions - 1] >= freq) return true; 477 while (pos < prefMaxSuggestions) { 478 if (priorities[pos] < freq 479 || (priorities[pos] == freq && length < suggestions.get(pos).length())) { 480 break; 481 } 482 pos++; 483 } 484 } 485 if (pos >= prefMaxSuggestions) { 486 return true; 487 } 488 489 System.arraycopy(priorities, pos, priorities, pos + 1, prefMaxSuggestions - pos - 1); 490 priorities[pos] = freq; 491 int poolSize = mStringPool.size(); 492 StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove(poolSize - 1) 493 : new StringBuilder(getApproxMaxWordLength()); 494 sb.setLength(0); 495 if (mIsAllUpperCase) { 496 sb.append(new String(word, offset, length).toUpperCase()); 497 } else if (mIsFirstCharCapitalized) { 498 sb.append(Character.toUpperCase(word[offset])); 499 if (length > 1) { 500 sb.append(word, offset + 1, length - 1); 501 } 502 } else { 503 sb.append(word, offset, length); 504 } 505 suggestions.add(pos, sb); 506 if (suggestions.size() > prefMaxSuggestions) { 507 CharSequence garbage = suggestions.remove(prefMaxSuggestions); 508 if (garbage instanceof StringBuilder) { 509 mStringPool.add(garbage); 510 } 511 } else { 512 LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog); 513 } 514 return true; 515 } 516 517 private int searchBigramSuggestion(final char[] word, final int offset, final int length) { 518 // TODO This is almost O(n^2). Might need fix. 519 // search whether the word appeared in bigram data 520 int bigramSuggestSize = mBigramSuggestions.size(); 521 for(int i = 0; i < bigramSuggestSize; i++) { 522 if(mBigramSuggestions.get(i).length() == length) { 523 boolean chk = true; 524 for(int j = 0; j < length; j++) { 525 if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) { 526 chk = false; 527 break; 528 } 529 } 530 if(chk) return i; 531 } 532 } 533 534 return -1; 535 } 536 537 public boolean isValidWord(final CharSequence word) { 538 if (word == null || word.length() == 0 || mMainDict == null) { 539 return false; 540 } 541 return mMainDict.isValidWord(word) 542 || (mUserDictionary != null && mUserDictionary.isValidWord(word)) 543 || (mAutoDictionary != null && mAutoDictionary.isValidWord(word)) 544 || (mContactsDictionary != null && mContactsDictionary.isValidWord(word)); 545 } 546 547 private void collectGarbage(ArrayList<CharSequence> suggestions, int prefMaxSuggestions) { 548 int poolSize = mStringPool.size(); 549 int garbageSize = suggestions.size(); 550 while (poolSize < prefMaxSuggestions && garbageSize > 0) { 551 CharSequence garbage = suggestions.get(garbageSize - 1); 552 if (garbage != null && garbage instanceof StringBuilder) { 553 mStringPool.add(garbage); 554 poolSize++; 555 } 556 garbageSize--; 557 } 558 if (poolSize == prefMaxSuggestions + 1) { 559 Log.w("Suggest", "String pool got too big: " + poolSize); 560 } 561 suggestions.clear(); 562 } 563 564 public void close() { 565 if (mMainDict != null) { 566 mMainDict.close(); 567 mMainDict = null; 568 } 569 if (mUserDictionary != null) { 570 mUserDictionary.close(); 571 mUserDictionary = null; 572 } 573 if (mUserBigramDictionary != null) { 574 mUserBigramDictionary.close(); 575 mUserBigramDictionary = null; 576 } 577 if (mContactsDictionary != null) { 578 mContactsDictionary.close(); 579 mContactsDictionary = null; 580 } 581 if (mAutoDictionary != null) { 582 mAutoDictionary.close(); 583 mAutoDictionary = null; 584 } 585 } 586} 587