AndroidWordLevelSpellCheckerSession.java revision e708b1bc2e11285ad404133b8de21719ce08acb5
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.spellcheck;

import android.content.ContentResolver;
import android.database.ContentObserver;
import android.os.Binder;
import android.provider.UserDictionary.Words;
import android.service.textservice.SpellCheckerService.Session;
import android.text.TextUtils;
import android.util.Log;
import android.util.LruCache;
import android.view.textservice.SuggestionsInfo;
import android.view.textservice.TextInfo;

import com.android.inputmethod.compat.SuggestionsInfoCompatUtils;
import com.android.inputmethod.latin.Constants;
import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.PrevWordsInfo;
import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
import com.android.inputmethod.latin.WordComposer;
import com.android.inputmethod.latin.spellcheck.AndroidSpellCheckerService.SuggestionsGatherer;
import com.android.inputmethod.latin.utils.CoordinateUtils;
import com.android.inputmethod.latin.utils.LocaleUtils;
import com.android.inputmethod.latin.utils.StringUtils;

import java.util.ArrayList;
import java.util.Locale;

/**
 * Spell checker session that evaluates one {@link TextInfo} at a time against the dictionary
 * pool of the session's locale.
 *
 * Results of full lookups are memoized in a small LRU cache, which is flushed whenever the
 * user dictionary content provider reports a change.
 */
public abstract class AndroidWordLevelSpellCheckerSession extends Session {
    private static final String TAG = AndroidWordLevelSpellCheckerSession.class.getSimpleName();
    private static final boolean DBG = false;

    // Immutable, but need the locale which is not available in the constructor yet
    private DictionaryPool mDictionaryPool;
    // Likewise
    private Locale mLocale;
    // Cache this for performance
    private int mScript; // One of SCRIPT_LATIN or SCRIPT_CYRILLIC for now.
    private final AndroidSpellCheckerService mService;
    protected final SuggestionsCache mSuggestionsCache = new SuggestionsCache();
    private final ContentObserver mObserver;

    /** Immutable pair of a suggestion list and the result flags that go with it. */
    private static final class SuggestionsParams {
        public final String[] mSuggestions;
        public final int mFlags;
        public SuggestionsParams(String[] suggestions, int flags) {
            mSuggestions = suggestions;
            mFlags = flags;
        }
    }

    /**
     * LRU cache of lookup results, keyed by the queried text and, when available and valid,
     * the previous-words context.
     */
    protected static final class SuggestionsCache {
        private static final char CHAR_DELIMITER = '\uFFFC';
        private static final int MAX_CACHE_SIZE = 50;
        private final LruCache<String, SuggestionsParams> mUnigramSuggestionsInfoCache =
                new LruCache<>(MAX_CACHE_SIZE);

        // TODO: Support n-gram input
        /**
         * Builds the cache key for a query.
         *
         * @param query the text being looked up.
         * @param prevWordsInfo the previous-words context; may be null, in which case the
         *         key is the query alone (same as for an invalid context).
         * @return the key, or the query itself when there is no usable context.
         */
        private static String generateKey(final String query, final PrevWordsInfo prevWordsInfo) {
            // The word-level entry point passes a null context, so it must be tolerated here;
            // dereferencing it made every such lookup fail with a NullPointerException.
            if (TextUtils.isEmpty(query) || null == prevWordsInfo || !prevWordsInfo.isValid()) {
                return query;
            }
            return query + CHAR_DELIMITER + prevWordsInfo;
        }

        /**
         * Looks up a previously stored result.
         *
         * @param query the text being looked up.
         * @param prevWordsInfo the previous-words context, or null if there is none.
         * @return the cached result, or null on a cache miss.
         */
        public SuggestionsParams getSuggestionsFromCache(String query,
                final PrevWordsInfo prevWordsInfo) {
            return mUnigramSuggestionsInfoCache.get(generateKey(query, prevWordsInfo));
        }

        /**
         * Stores a result. Nothing is stored when the suggestions are null or the query is
         * empty.
         *
         * @param query the text that was looked up.
         * @param prevWordsInfo the previous-words context, or null if there is none.
         * @param suggestions the suggestions to remember.
         * @param flags the result flags to remember.
         */
        public void putSuggestionsToCache(
                final String query, final PrevWordsInfo prevWordsInfo,
                final String[] suggestions, final int flags) {
            if (suggestions == null || TextUtils.isEmpty(query)) {
                return;
            }
            mUnigramSuggestionsInfoCache.put(
                    generateKey(query, prevWordsInfo), new SuggestionsParams(suggestions, flags));
        }

        /** Drops every cached result. */
        public void clearCache() {
            mUnigramSuggestionsInfoCache.evictAll();
        }
    }

    /**
     * Creates the session and registers an observer on the user dictionary so that cached
     * results are discarded when it changes. The observer is unregistered in
     * {@link #onClose()}.
     *
     * @param service the owning spell checker service.
     */
    AndroidWordLevelSpellCheckerSession(final AndroidSpellCheckerService service) {
        mService = service;
        final ContentResolver cres = service.getContentResolver();

        mObserver = new ContentObserver(null) {
            @Override
            public void onChange(boolean self) {
                mSuggestionsCache.clearCache();
            }
        };
        cres.registerContentObserver(Words.CONTENT_URI, true, mObserver);
    }

    /** Resolves the locale-dependent state: dictionary pool, locale object and script. */
    @Override
    public void onCreate() {
        final String localeString = getLocale();
        mDictionaryPool = mService.getDictionaryPool(localeString);
        mLocale = LocaleUtils.constructLocaleFromString(localeString);
        mScript = AndroidSpellCheckerService.getScriptFromLocale(mLocale);
    }

    /** Unregisters the user dictionary observer registered by the constructor. */
    @Override
    public void onClose() {
        final ContentResolver cres = mService.getContentResolver();
        cres.unregisterContentObserver(mObserver);
    }

    /*
     * Returns whether the code point is a letter that makes sense for the specified
     * locale for this spell checker.
     * The dictionaries supported by Latin IME are described in res/xml/spellchecker.xml
     * and is limited to EFIGS languages and Russian.
     * Hence at the moment this explicitly tests for Cyrillic characters or Latin characters
     * as appropriate, and explicitly excludes CJK, Arabic and Hebrew characters.
     */
    private static boolean isLetterCheckableByLanguage(final int codePoint,
            final int script) {
        switch (script) {
        case AndroidSpellCheckerService.SCRIPT_LATIN:
            // Our supported latin script dictionaries (EFIGS) at the moment only include
            // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode
            // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF,
            // so the below is a very efficient way to test for it. As for the 0-0x3F, it's
            // excluded from isLetter anyway.
            return codePoint <= 0x2AF && Character.isLetter(codePoint);
        case AndroidSpellCheckerService.SCRIPT_CYRILLIC:
            // All Cyrillic characters are in the 400~52F block. There are some in the upper
            // Unicode range, but they are archaic characters that are not used in modern
            // Russian and are not used by our dictionary.
            return codePoint >= 0x400 && codePoint <= 0x52F && Character.isLetter(codePoint);
        case AndroidSpellCheckerService.SCRIPT_GREEK:
            // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the
            // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters.
            // Our dictionary also contains a few words with 0xF2; it would be best to check
            // if that's correct, but a web search does return results for these words so
            // they are probably okay.
            return (codePoint >= 0x370 && codePoint <= 0x3FF)
                    || (codePoint >= 0x1F00 && codePoint <= 0x1FFF)
                    || codePoint == 0xF2;
        default:
            // Should never come here
            throw new RuntimeException("Impossible value of script: " + script);
        }
    }

    private static final int CHECKABILITY_CHECKABLE = 0;
    private static final int CHECKABILITY_TOO_MANY_NON_LETTERS = 1;
    private static final int CHECKABILITY_CONTAINS_PERIOD = 2;
    private static final int CHECKABILITY_EMAIL_OR_URL = 3;
    private static final int CHECKABILITY_FIRST_LETTER_UNCHECKABLE = 4;
    private static final int CHECKABILITY_TOO_SHORT = 5;

    /**
     * Finds out whether a particular string should be filtered out of spell checking.
     *
     * This will loosely match URLs, numbers, symbols. To avoid always underlining words that
     * we know we will never recognize, this accepts a script identifier that should be one
     * of the SCRIPT_* constants of {@link AndroidSpellCheckerService}, to rule out quickly
     * characters from very different languages.
     *
     * @param text the string to evaluate.
     * @param script the identifier for the script this spell checker recognizes
     * @return one of the CHECKABILITY_* constants above.
     */
    private static int getCheckabilityInScript(final String text, final int script) {
        if (TextUtils.isEmpty(text) || text.length() <= 1) return CHECKABILITY_TOO_SHORT;

        // TODO: check if an equivalent processing can't be done more quickly with a
        // compiled regexp.
        // Filter by first letter
        final int firstCodePoint = text.codePointAt(0);
        // Filter out words that don't start with a letter or an apostrophe
        if (!isLetterCheckableByLanguage(firstCodePoint, script)
                && '\'' != firstCodePoint) return CHECKABILITY_FIRST_LETTER_UNCHECKABLE;

        // Filter contents
        final int length = text.length();
        int letterCount = 0;
        for (int i = 0; i < length; i = text.offsetByCodePoints(i, 1)) {
            final int codePoint = text.codePointAt(i);
            // Any word containing a COMMERCIAL_AT is probably an e-mail address
            // Any word containing a SLASH is probably either an ad-hoc combination of two
            // words or a URI - in either case we don't want to spell check that
            if (Constants.CODE_COMMERCIAL_AT == codePoint || Constants.CODE_SLASH == codePoint) {
                return CHECKABILITY_EMAIL_OR_URL;
            }
            // If the string contains a period, native returns strange suggestions (it seems
            // to return suggestions for everything up to the period only and to ignore the
            // rest), so we suppress lookup if there is a period.
            // TODO: investigate why native returns these suggestions and remove this code.
            if (Constants.CODE_PERIOD == codePoint) {
                return CHECKABILITY_CONTAINS_PERIOD;
            }
            if (isLetterCheckableByLanguage(codePoint, script)) ++letterCount;
        }
        // Guestimate heuristic: perform spell checking if at least 3/4 of the characters
        // in this word are letters. Note that length is counted in UTF-16 units while
        // letterCount is counted in code points.
        return (letterCount * 4 < length * 3)
                ? CHECKABILITY_TOO_MANY_NON_LETTERS : CHECKABILITY_CHECKABLE;
    }

    /**
     * Helper method to test valid capitalizations of a word.
     *
     * If the "text" is lower-case, we test only the exact string.
     * If the "Text" is capitalized, we test the exact string "Text" and the lower-cased
     *  version of it "text".
     * If the "TEXT" is fully upper case, we test the exact string "TEXT", the lower-cased
     *  version of it "text" and the capitalized version of it "Text".
     *
     * @param dict the dictionary to look the word up in.
     * @param text the word to test.
     * @param capitalizeType the capitalization type of the word, one of the
     *         StringUtils.CAPITALIZE_* constants.
     * @return whether any of the tested forms is a valid word in the dictionary.
     */
    private boolean isInDictForAnyCapitalization(final Dictionary dict, final String text,
            final int capitalizeType) {
        // If the word is in there as is, then it's in the dictionary. If not, we'll test lower
        // case versions, but only if the word is not already all-lower case or mixed case.
        if (dict.isValidWord(text)) return true;
        if (StringUtils.CAPITALIZE_NONE == capitalizeType) return false;

        // If we come here, we have a capitalized word (either First- or All-).
        // Downcase the word and look it up again. If the word is only capitalized, we
        // tested all possibilities, so if it's still negative we can return false.
        final String lowerCaseText = text.toLowerCase(mLocale);
        if (dict.isValidWord(lowerCaseText)) return true;
        if (StringUtils.CAPITALIZE_FIRST == capitalizeType) return false;

        // If the lower case version is not in the dictionary, it's still possible
        // that we have an all-caps version of a word that needs to be capitalized
        // according to the dictionary. E.g. "GERMANS" only exists in the dictionary as "Germans".
        return dict.isValidWord(StringUtils.capitalizeFirstAndDowncaseRest(lowerCaseText, mLocale));
    }

    /**
     * Hands a dictionary obtained from the pool back to it, logging an error if the pool
     * refuses it. Does nothing when {@code dictInfo} is null.
     */
    private void returnDictionaryToPool(final DictAndKeyboard dictInfo) {
        if (null != dictInfo) {
            if (!mDictionaryPool.offer(dictInfo)) {
                Log.e(TAG, "Can't re-insert a dictionary into its pool");
            }
        }
    }

    /**
     * Builds the result for a text that {@link #getCheckabilityInScript} rejected.
     *
     * No suggestions are gathered for such text. The only exception is text containing
     * periods whose period-separated parts are all valid words: it is reported as a typo
     * with the space-joined parts as the single suggestion.
     *
     * @param inText the text as received from the client.
     * @param checkability the CHECKABILITY_* verdict for the text; must not be
     *         CHECKABILITY_CHECKABLE.
     * @return the suggestions info to hand back to the client.
     */
    private SuggestionsInfo getSuggestionsForUncheckableText(final String inText,
            final int checkability) {
        DictAndKeyboard dictInfo = null;
        try {
            dictInfo = mDictionaryPool.pollWithDefaultTimeout();
            if (!DictionaryPool.isAValidDictionary(dictInfo)) {
                return AndroidSpellCheckerService.getNotInDictEmptySuggestions(
                        false /* reportAsTypo */);
            }
            if (CHECKABILITY_CONTAINS_PERIOD == checkability) {
                final String[] splitText = inText.split(Constants.REGEXP_PERIOD);
                boolean allWordsAreValid = true;
                for (final String word : splitText) {
                    if (!dictInfo.mDictionary.isValidWord(word)) {
                        allWordsAreValid = false;
                        break;
                    }
                }
                if (allWordsAreValid) {
                    return new SuggestionsInfo(SuggestionsInfo.RESULT_ATTR_LOOKS_LIKE_TYPO
                            | SuggestionsInfo.RESULT_ATTR_HAS_RECOMMENDED_SUGGESTIONS,
                            new String[] {
                                    TextUtils.join(Constants.STRING_SPACE, splitText) });
                }
            }
            return dictInfo.mDictionary.isValidWord(inText)
                    ? AndroidSpellCheckerService.getInDictEmptySuggestions()
                    : AndroidSpellCheckerService.getNotInDictEmptySuggestions(
                            CHECKABILITY_CONTAINS_PERIOD == checkability
                            /* reportAsTypo */);
        } finally {
            returnDictionaryToPool(dictInfo);
        }
    }

    // Note : this must be reentrant
    /**
     * Gets a list of suggestions for a specific string. This returns a list of possible
     * corrections for the text passed as an argument. It may split or group words, and
     * even perform grammatical analysis.
     *
     * This overload performs the lookup without any previous-words context.
     *
     * @param textInfo the text to check.
     * @param suggestionsLimit the maximum number of suggestions requested.
     * @return the suggestions info for the text.
     */
    private SuggestionsInfo onGetSuggestionsInternal(final TextInfo textInfo,
            final int suggestionsLimit) {
        return onGetSuggestionsInternal(textInfo, null, suggestionsLimit);
    }

    /**
     * Gets a list of suggestions for a specific string, optionally taking the previous words
     * into account.
     *
     * The cache is consulted first. Text judged not checkable gets a quick verdict without
     * suggestions; otherwise the dictionary is queried and the result is cached. Any
     * RuntimeException raised on the way is logged and converted into an empty
     * "not in dictionary" result, unless DBG is set, in which case it is rethrown.
     *
     * @param textInfo the text to check.
     * @param prevWordsInfo the previous-words context, or null if there is none.
     * @param suggestionsLimit the maximum number of suggestions requested.
     * @return the suggestions info for the text.
     */
    protected SuggestionsInfo onGetSuggestionsInternal(
            final TextInfo textInfo, final PrevWordsInfo prevWordsInfo,
            final int suggestionsLimit) {
        try {
            final String inText = textInfo.getText();
            final SuggestionsParams cachedSuggestionsParams =
                    mSuggestionsCache.getSuggestionsFromCache(inText, prevWordsInfo);
            if (cachedSuggestionsParams != null) {
                if (DBG) {
                    Log.d(TAG, "Cache hit: " + inText + ", " + cachedSuggestionsParams.mFlags);
                }
                return new SuggestionsInfo(
                        cachedSuggestionsParams.mFlags, cachedSuggestionsParams.mSuggestions);
            }

            final int checkability = getCheckabilityInScript(inText, mScript);
            if (CHECKABILITY_CHECKABLE != checkability) {
                return getSuggestionsForUncheckableText(inText, checkability);
            }
            final String text = inText.replaceAll(
                    AndroidSpellCheckerService.APOSTROPHE, AndroidSpellCheckerService.SINGLE_QUOTE);

            // TODO: Don't gather suggestions if the limit is <= 0 unless necessary
            final SuggestionsGatherer suggestionsGatherer = mService.newSuggestionsGatherer(
                    text, suggestionsLimit);

            final int capitalizeType = StringUtils.getCapitalizationType(text);
            boolean isInDict = true;
            DictAndKeyboard dictInfo = null;
            try {
                dictInfo = mDictionaryPool.pollWithDefaultTimeout();
                if (!DictionaryPool.isAValidDictionary(dictInfo)) {
                    return AndroidSpellCheckerService.getNotInDictEmptySuggestions(
                            false /* reportAsTypo */);
                }
                final WordComposer composer = new WordComposer();
                final int[] codePoints = StringUtils.toCodePointArray(text);
                final int[] coordinates;
                if (null == dictInfo.mKeyboard) {
                    coordinates = CoordinateUtils.newCoordinateArray(codePoints.length,
                            Constants.NOT_A_COORDINATE, Constants.NOT_A_COORDINATE);
                } else {
                    coordinates = dictInfo.mKeyboard.getCoordinates(codePoints);
                }
                composer.setComposingWord(codePoints, coordinates);
                // TODO: make a spell checker option to block offensive words or not
                // NOTE(review): prevWordsInfo is null when coming from the word-level entry
                // point - confirm Dictionary#getSuggestions tolerates a null context.
                final ArrayList<SuggestedWordInfo> suggestions =
                        dictInfo.mDictionary.getSuggestions(composer, prevWordsInfo,
                                dictInfo.getProximityInfo(), true /* blockOffensiveWords */,
                                null /* additionalFeaturesOptions */, 0 /* sessionId */,
                                null /* inOutLanguageWeight */);
                if (suggestions != null) {
                    for (final SuggestedWordInfo suggestion : suggestions) {
                        final String suggestionStr = suggestion.mWord;
                        suggestionsGatherer.addWord(suggestionStr.toCharArray(), null, 0,
                                suggestionStr.length(), suggestion.mScore);
                    }
                }
                isInDict = isInDictForAnyCapitalization(dictInfo.mDictionary, text, capitalizeType);
            } finally {
                returnDictionaryToPool(dictInfo);
            }

            final SuggestionsGatherer.Result result = suggestionsGatherer.getResults(
                    capitalizeType, mLocale);

            if (DBG) {
                Log.i(TAG, "Spell checking results for " + text + " with suggestion limit "
                        + suggestionsLimit);
                Log.i(TAG, "IsInDict = " + isInDict);
                Log.i(TAG, "LooksLikeTypo = " + (!isInDict));
                Log.i(TAG, "HasRecommendedSuggestions = " + result.mHasRecommendedSuggestions);
                if (null != result.mSuggestions) {
                    for (String suggestion : result.mSuggestions) {
                        Log.i(TAG, suggestion);
                    }
                }
            }

            final int flags =
                    (isInDict ? SuggestionsInfo.RESULT_ATTR_IN_THE_DICTIONARY
                            : SuggestionsInfo.RESULT_ATTR_LOOKS_LIKE_TYPO)
                    | (result.mHasRecommendedSuggestions
                            ? SuggestionsInfoCompatUtils
                                    .getValueOf_RESULT_ATTR_HAS_RECOMMENDED_SUGGESTIONS()
                            : 0);
            final SuggestionsInfo retval = new SuggestionsInfo(flags, result.mSuggestions);
            // Store under the text as received: that is the key the lookup at the top of
            // this method uses, so the apostrophe-normalized form would never be found again.
            mSuggestionsCache.putSuggestionsToCache(inText, prevWordsInfo, result.mSuggestions,
                    flags);
            return retval;
        } catch (RuntimeException e) {
            // Don't kill the keyboard if there is a bug in the spell checker
            if (DBG) {
                throw e;
            } else {
                Log.e(TAG, "Exception while spellcheking", e);
                return AndroidSpellCheckerService.getNotInDictEmptySuggestions(
                        false /* reportAsTypo */);
            }
        }
    }

    /*
     * The spell checker acts on its own behalf. That is needed, in particular, to be able to
     * access the dictionary files, which the provider restricts to the identity of Latin IME.
     * Since it's called externally by the application, the spell checker is using the identity
     * of the application by default unless we clearCallingIdentity.
     * That's what the following method does.
     */
    @Override
    public SuggestionsInfo onGetSuggestions(final TextInfo textInfo,
            final int suggestionsLimit) {
        long ident = Binder.clearCallingIdentity();
        try {
            return onGetSuggestionsInternal(textInfo, suggestionsLimit);
        } finally {
            Binder.restoreCallingIdentity(ident);
        }
    }
}