AndroidWordLevelSpellCheckerSession.java revision a91561aa58db1c43092c1caecc051a11fa5391c7
1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin.spellcheck; 18 19import android.content.ContentResolver; 20import android.database.ContentObserver; 21import android.os.Binder; 22import android.provider.UserDictionary.Words; 23import android.service.textservice.SpellCheckerService.Session; 24import android.text.TextUtils; 25import android.util.Log; 26import android.util.LruCache; 27import android.view.textservice.SuggestionsInfo; 28import android.view.textservice.TextInfo; 29 30import com.android.inputmethod.compat.SuggestionsInfoCompatUtils; 31import com.android.inputmethod.latin.Constants; 32import com.android.inputmethod.latin.Dictionary; 33import com.android.inputmethod.latin.PrevWordsInfo; 34import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; 35import com.android.inputmethod.latin.WordComposer; 36import com.android.inputmethod.latin.spellcheck.AndroidSpellCheckerService.SuggestionsGatherer; 37import com.android.inputmethod.latin.utils.CoordinateUtils; 38import com.android.inputmethod.latin.utils.LocaleUtils; 39import com.android.inputmethod.latin.utils.StringUtils; 40 41import java.util.ArrayList; 42import java.util.Locale; 43 44public abstract class AndroidWordLevelSpellCheckerSession extends Session { 45 private static final String TAG = AndroidWordLevelSpellCheckerSession.class.getSimpleName(); 46 private static final boolean DBG = false; 47 48 // Immutable, but need the locale which is not available in the constructor yet 49 private DictionaryPool mDictionaryPool; 50 // Likewise 51 private Locale mLocale; 52 // Cache this for performance 53 private int mScript; // One of SCRIPT_LATIN or SCRIPT_CYRILLIC for now. 54 private final AndroidSpellCheckerService mService; 55 protected final SuggestionsCache mSuggestionsCache = new SuggestionsCache(); 56 private final ContentObserver mObserver; 57 58 private static final class SuggestionsParams { 59 public final String[] mSuggestions; 60 public final int mFlags; 61 public SuggestionsParams(String[] suggestions, int flags) { 62 mSuggestions = suggestions; 63 mFlags = flags; 64 } 65 } 66 67 protected static final class SuggestionsCache { 68 private static final char CHAR_DELIMITER = '\uFFFC'; 69 private static final int MAX_CACHE_SIZE = 50; 70 private final LruCache<String, SuggestionsParams> mUnigramSuggestionsInfoCache = 71 new LruCache<>(MAX_CACHE_SIZE); 72 73 // TODO: Support n-gram input 74 private static String generateKey(final String query, final PrevWordsInfo prevWordsInfo) { 75 if (TextUtils.isEmpty(query) || TextUtils.isEmpty(prevWordsInfo.mPrevWord)) { 76 return query; 77 } 78 return query + CHAR_DELIMITER + prevWordsInfo.mPrevWord; 79 } 80 81 public SuggestionsParams getSuggestionsFromCache(String query, 82 final PrevWordsInfo prevWordsInfo) { 83 return mUnigramSuggestionsInfoCache.get(generateKey(query, prevWordsInfo)); 84 } 85 86 public void putSuggestionsToCache( 87 final String query, final PrevWordsInfo prevWordsInfo, 88 final String[] suggestions, final int flags) { 89 if (suggestions == null || TextUtils.isEmpty(query)) { 90 return; 91 } 92 mUnigramSuggestionsInfoCache.put( 93 generateKey(query, prevWordsInfo), new SuggestionsParams(suggestions, flags)); 94 } 95 96 public void clearCache() { 97 mUnigramSuggestionsInfoCache.evictAll(); 98 } 99 } 100 101 AndroidWordLevelSpellCheckerSession(final AndroidSpellCheckerService service) { 102 mService = service; 103 final ContentResolver cres = service.getContentResolver(); 104 105 mObserver = new ContentObserver(null) { 106 @Override 107 public void onChange(boolean self) { 108 mSuggestionsCache.clearCache(); 109 } 110 }; 111 cres.registerContentObserver(Words.CONTENT_URI, true, mObserver); 112 } 113 114 @Override 115 public void onCreate() { 116 final String localeString = getLocale(); 117 mDictionaryPool = mService.getDictionaryPool(localeString); 118 mLocale = LocaleUtils.constructLocaleFromString(localeString); 119 mScript = AndroidSpellCheckerService.getScriptFromLocale(mLocale); 120 } 121 122 @Override 123 public void onClose() { 124 final ContentResolver cres = mService.getContentResolver(); 125 cres.unregisterContentObserver(mObserver); 126 } 127 128 /* 129 * Returns whether the code point is a letter that makes sense for the specified 130 * locale for this spell checker. 131 * The dictionaries supported by Latin IME are described in res/xml/spellchecker.xml 132 * and is limited to EFIGS languages and Russian. 133 * Hence at the moment this explicitly tests for Cyrillic characters or Latin characters 134 * as appropriate, and explicitly excludes CJK, Arabic and Hebrew characters. 135 */ 136 private static boolean isLetterCheckableByLanguage(final int codePoint, 137 final int script) { 138 switch (script) { 139 case AndroidSpellCheckerService.SCRIPT_LATIN: 140 // Our supported latin script dictionaries (EFIGS) at the moment only include 141 // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode 142 // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF, 143 // so the below is a very efficient way to test for it. As for the 0-0x3F, it's 144 // excluded from isLetter anyway. 145 return codePoint <= 0x2AF && Character.isLetter(codePoint); 146 case AndroidSpellCheckerService.SCRIPT_CYRILLIC: 147 // All Cyrillic characters are in the 400~52F block. There are some in the upper 148 // Unicode range, but they are archaic characters that are not used in modern 149 // Russian and are not used by our dictionary. 150 return codePoint >= 0x400 && codePoint <= 0x52F && Character.isLetter(codePoint); 151 case AndroidSpellCheckerService.SCRIPT_GREEK: 152 // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the 153 // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters. 154 // Our dictionary also contains a few words with 0xF2; it would be best to check 155 // if that's correct, but a web search does return results for these words so 156 // they are probably okay. 157 return (codePoint >= 0x370 && codePoint <= 0x3FF) 158 || (codePoint >= 0x1F00 && codePoint <= 0x1FFF) 159 || codePoint == 0xF2; 160 default: 161 // Should never come here 162 throw new RuntimeException("Impossible value of script: " + script); 163 } 164 } 165 166 private static final int CHECKABILITY_CHECKABLE = 0; 167 private static final int CHECKABILITY_TOO_MANY_NON_LETTERS = 1; 168 private static final int CHECKABILITY_CONTAINS_PERIOD = 2; 169 private static final int CHECKABILITY_EMAIL_OR_URL = 3; 170 private static final int CHECKABILITY_FIRST_LETTER_UNCHECKABLE = 4; 171 private static final int CHECKABILITY_TOO_SHORT = 5; 172 /** 173 * Finds out whether a particular string should be filtered out of spell checking. 174 * 175 * This will loosely match URLs, numbers, symbols. To avoid always underlining words that 176 * we know we will never recognize, this accepts a script identifier that should be one 177 * of the SCRIPT_* constants defined above, to rule out quickly characters from very 178 * different languages. 179 * 180 * @param text the string to evaluate. 181 * @param script the identifier for the script this spell checker recognizes 182 * @return one of the FILTER_OUT_* constants above. 183 */ 184 private static int getCheckabilityInScript(final String text, final int script) { 185 if (TextUtils.isEmpty(text) || text.length() <= 1) return CHECKABILITY_TOO_SHORT; 186 187 // TODO: check if an equivalent processing can't be done more quickly with a 188 // compiled regexp. 189 // Filter by first letter 190 final int firstCodePoint = text.codePointAt(0); 191 // Filter out words that don't start with a letter or an apostrophe 192 if (!isLetterCheckableByLanguage(firstCodePoint, script) 193 && '\'' != firstCodePoint) return CHECKABILITY_FIRST_LETTER_UNCHECKABLE; 194 195 // Filter contents 196 final int length = text.length(); 197 int letterCount = 0; 198 for (int i = 0; i < length; i = text.offsetByCodePoints(i, 1)) { 199 final int codePoint = text.codePointAt(i); 200 // Any word containing a COMMERCIAL_AT is probably an e-mail address 201 // Any word containing a SLASH is probably either an ad-hoc combination of two 202 // words or a URI - in either case we don't want to spell check that 203 if (Constants.CODE_COMMERCIAL_AT == codePoint || Constants.CODE_SLASH == codePoint) { 204 return CHECKABILITY_EMAIL_OR_URL; 205 } 206 // If the string contains a period, native returns strange suggestions (it seems 207 // to return suggestions for everything up to the period only and to ignore the 208 // rest), so we suppress lookup if there is a period. 209 // TODO: investigate why native returns these suggestions and remove this code. 210 if (Constants.CODE_PERIOD == codePoint) { 211 return CHECKABILITY_CONTAINS_PERIOD; 212 } 213 if (isLetterCheckableByLanguage(codePoint, script)) ++letterCount; 214 } 215 // Guestimate heuristic: perform spell checking if at least 3/4 of the characters 216 // in this word are letters 217 return (letterCount * 4 < length * 3) 218 ? CHECKABILITY_TOO_MANY_NON_LETTERS : CHECKABILITY_CHECKABLE; 219 } 220 221 /** 222 * Helper method to test valid capitalizations of a word. 223 * 224 * If the "text" is lower-case, we test only the exact string. 225 * If the "Text" is capitalized, we test the exact string "Text" and the lower-cased 226 * version of it "text". 227 * If the "TEXT" is fully upper case, we test the exact string "TEXT", the lower-cased 228 * version of it "text" and the capitalized version of it "Text". 229 */ 230 private boolean isInDictForAnyCapitalization(final Dictionary dict, final String text, 231 final int capitalizeType) { 232 // If the word is in there as is, then it's in the dictionary. If not, we'll test lower 233 // case versions, but only if the word is not already all-lower case or mixed case. 234 if (dict.isValidWord(text)) return true; 235 if (StringUtils.CAPITALIZE_NONE == capitalizeType) return false; 236 237 // If we come here, we have a capitalized word (either First- or All-). 238 // Downcase the word and look it up again. If the word is only capitalized, we 239 // tested all possibilities, so if it's still negative we can return false. 240 final String lowerCaseText = text.toLowerCase(mLocale); 241 if (dict.isValidWord(lowerCaseText)) return true; 242 if (StringUtils.CAPITALIZE_FIRST == capitalizeType) return false; 243 244 // If the lower case version is not in the dictionary, it's still possible 245 // that we have an all-caps version of a word that needs to be capitalized 246 // according to the dictionary. E.g. "GERMANS" only exists in the dictionary as "Germans". 247 return dict.isValidWord(StringUtils.capitalizeFirstAndDowncaseRest(lowerCaseText, mLocale)); 248 } 249 250 // Note : this must be reentrant 251 /** 252 * Gets a list of suggestions for a specific string. This returns a list of possible 253 * corrections for the text passed as an argument. It may split or group words, and 254 * even perform grammatical analysis. 255 */ 256 private SuggestionsInfo onGetSuggestionsInternal(final TextInfo textInfo, 257 final int suggestionsLimit) { 258 return onGetSuggestionsInternal(textInfo, null, suggestionsLimit); 259 } 260 261 protected SuggestionsInfo onGetSuggestionsInternal( 262 final TextInfo textInfo, final PrevWordsInfo prevWordsInfo, 263 final int suggestionsLimit) { 264 try { 265 final String inText = textInfo.getText(); 266 final SuggestionsParams cachedSuggestionsParams = 267 mSuggestionsCache.getSuggestionsFromCache(inText, prevWordsInfo); 268 if (cachedSuggestionsParams != null) { 269 if (DBG) { 270 Log.d(TAG, "Cache hit: " + inText + ", " + cachedSuggestionsParams.mFlags); 271 } 272 return new SuggestionsInfo( 273 cachedSuggestionsParams.mFlags, cachedSuggestionsParams.mSuggestions); 274 } 275 276 final int checkability = getCheckabilityInScript(inText, mScript); 277 if (CHECKABILITY_CHECKABLE != checkability) { 278 DictAndKeyboard dictInfo = null; 279 try { 280 dictInfo = mDictionaryPool.pollWithDefaultTimeout(); 281 if (!DictionaryPool.isAValidDictionary(dictInfo)) { 282 return AndroidSpellCheckerService.getNotInDictEmptySuggestions( 283 false /* reportAsTypo */); 284 } 285 return dictInfo.mDictionary.isValidWord(inText) 286 ? AndroidSpellCheckerService.getInDictEmptySuggestions() 287 : AndroidSpellCheckerService.getNotInDictEmptySuggestions( 288 CHECKABILITY_CONTAINS_PERIOD == checkability 289 /* reportAsTypo */); 290 } finally { 291 if (null != dictInfo) { 292 if (!mDictionaryPool.offer(dictInfo)) { 293 Log.e(TAG, "Can't re-insert a dictionary into its pool"); 294 } 295 } 296 } 297 } 298 final String text = inText.replaceAll( 299 AndroidSpellCheckerService.APOSTROPHE, AndroidSpellCheckerService.SINGLE_QUOTE); 300 301 // TODO: Don't gather suggestions if the limit is <= 0 unless necessary 302 //final SuggestionsGatherer suggestionsGatherer = new SuggestionsGatherer(text, 303 //mService.mSuggestionThreshold, mService.mRecommendedThreshold, 304 //suggestionsLimit); 305 final SuggestionsGatherer suggestionsGatherer = mService.newSuggestionsGatherer( 306 text, suggestionsLimit); 307 308 final int capitalizeType = StringUtils.getCapitalizationType(text); 309 boolean isInDict = true; 310 DictAndKeyboard dictInfo = null; 311 try { 312 dictInfo = mDictionaryPool.pollWithDefaultTimeout(); 313 if (!DictionaryPool.isAValidDictionary(dictInfo)) { 314 return AndroidSpellCheckerService.getNotInDictEmptySuggestions( 315 false /* reportAsTypo */); 316 } 317 final WordComposer composer = new WordComposer(); 318 final int[] codePoints = StringUtils.toCodePointArray(text); 319 final int[] coordinates; 320 if (null == dictInfo.mKeyboard) { 321 coordinates = CoordinateUtils.newCoordinateArray(codePoints.length, 322 Constants.NOT_A_COORDINATE, Constants.NOT_A_COORDINATE); 323 } else { 324 coordinates = dictInfo.mKeyboard.getCoordinates(codePoints); 325 } 326 composer.setComposingWord(codePoints, coordinates, null /* previousWord */); 327 // TODO: make a spell checker option to block offensive words or not 328 final ArrayList<SuggestedWordInfo> suggestions = 329 dictInfo.mDictionary.getSuggestions(composer, prevWordsInfo, 330 dictInfo.getProximityInfo(), true /* blockOffensiveWords */, 331 null /* additionalFeaturesOptions */, 0 /* sessionId */, 332 null /* inOutLanguageWeight */); 333 if (suggestions != null) { 334 for (final SuggestedWordInfo suggestion : suggestions) { 335 final String suggestionStr = suggestion.mWord; 336 suggestionsGatherer.addWord(suggestionStr.toCharArray(), null, 0, 337 suggestionStr.length(), suggestion.mScore); 338 } 339 } 340 isInDict = isInDictForAnyCapitalization(dictInfo.mDictionary, text, capitalizeType); 341 } finally { 342 if (null != dictInfo) { 343 if (!mDictionaryPool.offer(dictInfo)) { 344 Log.e(TAG, "Can't re-insert a dictionary into its pool"); 345 } 346 } 347 } 348 349 final SuggestionsGatherer.Result result = suggestionsGatherer.getResults( 350 capitalizeType, mLocale); 351 352 if (DBG) { 353 Log.i(TAG, "Spell checking results for " + text + " with suggestion limit " 354 + suggestionsLimit); 355 Log.i(TAG, "IsInDict = " + isInDict); 356 Log.i(TAG, "LooksLikeTypo = " + (!isInDict)); 357 Log.i(TAG, "HasRecommendedSuggestions = " + result.mHasRecommendedSuggestions); 358 if (null != result.mSuggestions) { 359 for (String suggestion : result.mSuggestions) { 360 Log.i(TAG, suggestion); 361 } 362 } 363 } 364 365 final int flags = 366 (isInDict ? SuggestionsInfo.RESULT_ATTR_IN_THE_DICTIONARY 367 : SuggestionsInfo.RESULT_ATTR_LOOKS_LIKE_TYPO) 368 | (result.mHasRecommendedSuggestions 369 ? SuggestionsInfoCompatUtils 370 .getValueOf_RESULT_ATTR_HAS_RECOMMENDED_SUGGESTIONS() 371 : 0); 372 final SuggestionsInfo retval = new SuggestionsInfo(flags, result.mSuggestions); 373 mSuggestionsCache.putSuggestionsToCache(text, prevWordsInfo, result.mSuggestions, 374 flags); 375 return retval; 376 } catch (RuntimeException e) { 377 // Don't kill the keyboard if there is a bug in the spell checker 378 if (DBG) { 379 throw e; 380 } else { 381 Log.e(TAG, "Exception while spellcheking", e); 382 return AndroidSpellCheckerService.getNotInDictEmptySuggestions( 383 false /* reportAsTypo */); 384 } 385 } 386 } 387 388 /* 389 * The spell checker acts on its own behalf. That is needed, in particular, to be able to 390 * access the dictionary files, which the provider restricts to the identity of Latin IME. 391 * Since it's called externally by the application, the spell checker is using the identity 392 * of the application by default unless we clearCallingIdentity. 393 * That's what the following method does. 394 */ 395 @Override 396 public SuggestionsInfo onGetSuggestions(final TextInfo textInfo, 397 final int suggestionsLimit) { 398 long ident = Binder.clearCallingIdentity(); 399 try { 400 return onGetSuggestionsInternal(textInfo, suggestionsLimit); 401 } finally { 402 Binder.restoreCallingIdentity(ident); 403 } 404 } 405} 406