AndroidWordLevelSpellCheckerSession.java revision a91561aa58db1c43092c1caecc051a11fa5391c7
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.android.inputmethod.latin.spellcheck;
18
19import android.content.ContentResolver;
20import android.database.ContentObserver;
21import android.os.Binder;
22import android.provider.UserDictionary.Words;
23import android.service.textservice.SpellCheckerService.Session;
24import android.text.TextUtils;
25import android.util.Log;
26import android.util.LruCache;
27import android.view.textservice.SuggestionsInfo;
28import android.view.textservice.TextInfo;
29
30import com.android.inputmethod.compat.SuggestionsInfoCompatUtils;
31import com.android.inputmethod.latin.Constants;
32import com.android.inputmethod.latin.Dictionary;
33import com.android.inputmethod.latin.PrevWordsInfo;
34import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
35import com.android.inputmethod.latin.WordComposer;
36import com.android.inputmethod.latin.spellcheck.AndroidSpellCheckerService.SuggestionsGatherer;
37import com.android.inputmethod.latin.utils.CoordinateUtils;
38import com.android.inputmethod.latin.utils.LocaleUtils;
39import com.android.inputmethod.latin.utils.StringUtils;
40
41import java.util.ArrayList;
42import java.util.Locale;
43
44public abstract class AndroidWordLevelSpellCheckerSession extends Session {
45    private static final String TAG = AndroidWordLevelSpellCheckerSession.class.getSimpleName();
46    private static final boolean DBG = false;
47
48    // Immutable, but need the locale which is not available in the constructor yet
49    private DictionaryPool mDictionaryPool;
50    // Likewise
51    private Locale mLocale;
52    // Cache this for performance
53    private int mScript; // One of SCRIPT_LATIN or SCRIPT_CYRILLIC for now.
54    private final AndroidSpellCheckerService mService;
55    protected final SuggestionsCache mSuggestionsCache = new SuggestionsCache();
56    private final ContentObserver mObserver;
57
58    private static final class SuggestionsParams {
59        public final String[] mSuggestions;
60        public final int mFlags;
61        public SuggestionsParams(String[] suggestions, int flags) {
62            mSuggestions = suggestions;
63            mFlags = flags;
64        }
65    }
66
67    protected static final class SuggestionsCache {
68        private static final char CHAR_DELIMITER = '\uFFFC';
69        private static final int MAX_CACHE_SIZE = 50;
70        private final LruCache<String, SuggestionsParams> mUnigramSuggestionsInfoCache =
71                new LruCache<>(MAX_CACHE_SIZE);
72
73        // TODO: Support n-gram input
74        private static String generateKey(final String query, final PrevWordsInfo prevWordsInfo) {
75            if (TextUtils.isEmpty(query) || TextUtils.isEmpty(prevWordsInfo.mPrevWord)) {
76                return query;
77            }
78            return query + CHAR_DELIMITER + prevWordsInfo.mPrevWord;
79        }
80
81        public SuggestionsParams getSuggestionsFromCache(String query,
82                final PrevWordsInfo prevWordsInfo) {
83            return mUnigramSuggestionsInfoCache.get(generateKey(query, prevWordsInfo));
84        }
85
86        public void putSuggestionsToCache(
87                final String query, final PrevWordsInfo prevWordsInfo,
88                final String[] suggestions, final int flags) {
89            if (suggestions == null || TextUtils.isEmpty(query)) {
90                return;
91            }
92            mUnigramSuggestionsInfoCache.put(
93                    generateKey(query, prevWordsInfo), new SuggestionsParams(suggestions, flags));
94        }
95
96        public void clearCache() {
97            mUnigramSuggestionsInfoCache.evictAll();
98        }
99    }
100
101    AndroidWordLevelSpellCheckerSession(final AndroidSpellCheckerService service) {
102        mService = service;
103        final ContentResolver cres = service.getContentResolver();
104
105        mObserver = new ContentObserver(null) {
106            @Override
107            public void onChange(boolean self) {
108                mSuggestionsCache.clearCache();
109            }
110        };
111        cres.registerContentObserver(Words.CONTENT_URI, true, mObserver);
112    }
113
114    @Override
115    public void onCreate() {
116        final String localeString = getLocale();
117        mDictionaryPool = mService.getDictionaryPool(localeString);
118        mLocale = LocaleUtils.constructLocaleFromString(localeString);
119        mScript = AndroidSpellCheckerService.getScriptFromLocale(mLocale);
120    }
121
122    @Override
123    public void onClose() {
124        final ContentResolver cres = mService.getContentResolver();
125        cres.unregisterContentObserver(mObserver);
126    }
127
128    /*
129     * Returns whether the code point is a letter that makes sense for the specified
130     * locale for this spell checker.
131     * The dictionaries supported by Latin IME are described in res/xml/spellchecker.xml
132     * and is limited to EFIGS languages and Russian.
133     * Hence at the moment this explicitly tests for Cyrillic characters or Latin characters
134     * as appropriate, and explicitly excludes CJK, Arabic and Hebrew characters.
135     */
136    private static boolean isLetterCheckableByLanguage(final int codePoint,
137            final int script) {
138        switch (script) {
139        case AndroidSpellCheckerService.SCRIPT_LATIN:
140            // Our supported latin script dictionaries (EFIGS) at the moment only include
141            // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode
142            // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF,
143            // so the below is a very efficient way to test for it. As for the 0-0x3F, it's
144            // excluded from isLetter anyway.
145            return codePoint <= 0x2AF && Character.isLetter(codePoint);
146        case AndroidSpellCheckerService.SCRIPT_CYRILLIC:
147            // All Cyrillic characters are in the 400~52F block. There are some in the upper
148            // Unicode range, but they are archaic characters that are not used in modern
149            // Russian and are not used by our dictionary.
150            return codePoint >= 0x400 && codePoint <= 0x52F && Character.isLetter(codePoint);
151        case AndroidSpellCheckerService.SCRIPT_GREEK:
152            // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the
153            // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters.
154            // Our dictionary also contains a few words with 0xF2; it would be best to check
155            // if that's correct, but a web search does return results for these words so
156            // they are probably okay.
157            return (codePoint >= 0x370 && codePoint <= 0x3FF)
158                    || (codePoint >= 0x1F00 && codePoint <= 0x1FFF)
159                    || codePoint == 0xF2;
160        default:
161            // Should never come here
162            throw new RuntimeException("Impossible value of script: " + script);
163        }
164    }
165
166    private static final int CHECKABILITY_CHECKABLE = 0;
167    private static final int CHECKABILITY_TOO_MANY_NON_LETTERS = 1;
168    private static final int CHECKABILITY_CONTAINS_PERIOD = 2;
169    private static final int CHECKABILITY_EMAIL_OR_URL = 3;
170    private static final int CHECKABILITY_FIRST_LETTER_UNCHECKABLE = 4;
171    private static final int CHECKABILITY_TOO_SHORT = 5;
172    /**
173     * Finds out whether a particular string should be filtered out of spell checking.
174     *
175     * This will loosely match URLs, numbers, symbols. To avoid always underlining words that
176     * we know we will never recognize, this accepts a script identifier that should be one
177     * of the SCRIPT_* constants defined above, to rule out quickly characters from very
178     * different languages.
179     *
180     * @param text the string to evaluate.
181     * @param script the identifier for the script this spell checker recognizes
182     * @return one of the FILTER_OUT_* constants above.
183     */
184    private static int getCheckabilityInScript(final String text, final int script) {
185        if (TextUtils.isEmpty(text) || text.length() <= 1) return CHECKABILITY_TOO_SHORT;
186
187        // TODO: check if an equivalent processing can't be done more quickly with a
188        // compiled regexp.
189        // Filter by first letter
190        final int firstCodePoint = text.codePointAt(0);
191        // Filter out words that don't start with a letter or an apostrophe
192        if (!isLetterCheckableByLanguage(firstCodePoint, script)
193                && '\'' != firstCodePoint) return CHECKABILITY_FIRST_LETTER_UNCHECKABLE;
194
195        // Filter contents
196        final int length = text.length();
197        int letterCount = 0;
198        for (int i = 0; i < length; i = text.offsetByCodePoints(i, 1)) {
199            final int codePoint = text.codePointAt(i);
200            // Any word containing a COMMERCIAL_AT is probably an e-mail address
201            // Any word containing a SLASH is probably either an ad-hoc combination of two
202            // words or a URI - in either case we don't want to spell check that
203            if (Constants.CODE_COMMERCIAL_AT == codePoint || Constants.CODE_SLASH == codePoint) {
204                return CHECKABILITY_EMAIL_OR_URL;
205            }
206            // If the string contains a period, native returns strange suggestions (it seems
207            // to return suggestions for everything up to the period only and to ignore the
208            // rest), so we suppress lookup if there is a period.
209            // TODO: investigate why native returns these suggestions and remove this code.
210            if (Constants.CODE_PERIOD == codePoint) {
211                return CHECKABILITY_CONTAINS_PERIOD;
212            }
213            if (isLetterCheckableByLanguage(codePoint, script)) ++letterCount;
214        }
215        // Guestimate heuristic: perform spell checking if at least 3/4 of the characters
216        // in this word are letters
217        return (letterCount * 4 < length * 3)
218                ? CHECKABILITY_TOO_MANY_NON_LETTERS : CHECKABILITY_CHECKABLE;
219    }
220
221    /**
222     * Helper method to test valid capitalizations of a word.
223     *
224     * If the "text" is lower-case, we test only the exact string.
225     * If the "Text" is capitalized, we test the exact string "Text" and the lower-cased
226     *  version of it "text".
227     * If the "TEXT" is fully upper case, we test the exact string "TEXT", the lower-cased
228     *  version of it "text" and the capitalized version of it "Text".
229     */
230    private boolean isInDictForAnyCapitalization(final Dictionary dict, final String text,
231            final int capitalizeType) {
232        // If the word is in there as is, then it's in the dictionary. If not, we'll test lower
233        // case versions, but only if the word is not already all-lower case or mixed case.
234        if (dict.isValidWord(text)) return true;
235        if (StringUtils.CAPITALIZE_NONE == capitalizeType) return false;
236
237        // If we come here, we have a capitalized word (either First- or All-).
238        // Downcase the word and look it up again. If the word is only capitalized, we
239        // tested all possibilities, so if it's still negative we can return false.
240        final String lowerCaseText = text.toLowerCase(mLocale);
241        if (dict.isValidWord(lowerCaseText)) return true;
242        if (StringUtils.CAPITALIZE_FIRST == capitalizeType) return false;
243
244        // If the lower case version is not in the dictionary, it's still possible
245        // that we have an all-caps version of a word that needs to be capitalized
246        // according to the dictionary. E.g. "GERMANS" only exists in the dictionary as "Germans".
247        return dict.isValidWord(StringUtils.capitalizeFirstAndDowncaseRest(lowerCaseText, mLocale));
248    }
249
250    // Note : this must be reentrant
251    /**
252     * Gets a list of suggestions for a specific string. This returns a list of possible
253     * corrections for the text passed as an argument. It may split or group words, and
254     * even perform grammatical analysis.
255     */
256    private SuggestionsInfo onGetSuggestionsInternal(final TextInfo textInfo,
257            final int suggestionsLimit) {
258        return onGetSuggestionsInternal(textInfo, null, suggestionsLimit);
259    }
260
261    protected SuggestionsInfo onGetSuggestionsInternal(
262            final TextInfo textInfo, final PrevWordsInfo prevWordsInfo,
263            final int suggestionsLimit) {
264        try {
265            final String inText = textInfo.getText();
266            final SuggestionsParams cachedSuggestionsParams =
267                    mSuggestionsCache.getSuggestionsFromCache(inText, prevWordsInfo);
268            if (cachedSuggestionsParams != null) {
269                if (DBG) {
270                    Log.d(TAG, "Cache hit: " + inText + ", " + cachedSuggestionsParams.mFlags);
271                }
272                return new SuggestionsInfo(
273                        cachedSuggestionsParams.mFlags, cachedSuggestionsParams.mSuggestions);
274            }
275
276            final int checkability = getCheckabilityInScript(inText, mScript);
277            if (CHECKABILITY_CHECKABLE != checkability) {
278                DictAndKeyboard dictInfo = null;
279                try {
280                    dictInfo = mDictionaryPool.pollWithDefaultTimeout();
281                    if (!DictionaryPool.isAValidDictionary(dictInfo)) {
282                        return AndroidSpellCheckerService.getNotInDictEmptySuggestions(
283                                false /* reportAsTypo */);
284                    }
285                    return dictInfo.mDictionary.isValidWord(inText)
286                            ? AndroidSpellCheckerService.getInDictEmptySuggestions()
287                            : AndroidSpellCheckerService.getNotInDictEmptySuggestions(
288                                    CHECKABILITY_CONTAINS_PERIOD == checkability
289                                    /* reportAsTypo */);
290                } finally {
291                    if (null != dictInfo) {
292                        if (!mDictionaryPool.offer(dictInfo)) {
293                            Log.e(TAG, "Can't re-insert a dictionary into its pool");
294                        }
295                    }
296                }
297            }
298            final String text = inText.replaceAll(
299                    AndroidSpellCheckerService.APOSTROPHE, AndroidSpellCheckerService.SINGLE_QUOTE);
300
301            // TODO: Don't gather suggestions if the limit is <= 0 unless necessary
302            //final SuggestionsGatherer suggestionsGatherer = new SuggestionsGatherer(text,
303            //mService.mSuggestionThreshold, mService.mRecommendedThreshold,
304            //suggestionsLimit);
305            final SuggestionsGatherer suggestionsGatherer = mService.newSuggestionsGatherer(
306                    text, suggestionsLimit);
307
308            final int capitalizeType = StringUtils.getCapitalizationType(text);
309            boolean isInDict = true;
310            DictAndKeyboard dictInfo = null;
311            try {
312                dictInfo = mDictionaryPool.pollWithDefaultTimeout();
313                if (!DictionaryPool.isAValidDictionary(dictInfo)) {
314                    return AndroidSpellCheckerService.getNotInDictEmptySuggestions(
315                            false /* reportAsTypo */);
316                }
317                final WordComposer composer = new WordComposer();
318                final int[] codePoints = StringUtils.toCodePointArray(text);
319                final int[] coordinates;
320                if (null == dictInfo.mKeyboard) {
321                    coordinates = CoordinateUtils.newCoordinateArray(codePoints.length,
322                            Constants.NOT_A_COORDINATE, Constants.NOT_A_COORDINATE);
323                } else {
324                    coordinates = dictInfo.mKeyboard.getCoordinates(codePoints);
325                }
326                composer.setComposingWord(codePoints, coordinates, null /* previousWord */);
327                // TODO: make a spell checker option to block offensive words or not
328                final ArrayList<SuggestedWordInfo> suggestions =
329                        dictInfo.mDictionary.getSuggestions(composer, prevWordsInfo,
330                                dictInfo.getProximityInfo(), true /* blockOffensiveWords */,
331                                null /* additionalFeaturesOptions */, 0 /* sessionId */,
332                                null /* inOutLanguageWeight */);
333                if (suggestions != null) {
334                    for (final SuggestedWordInfo suggestion : suggestions) {
335                        final String suggestionStr = suggestion.mWord;
336                        suggestionsGatherer.addWord(suggestionStr.toCharArray(), null, 0,
337                                suggestionStr.length(), suggestion.mScore);
338                    }
339                }
340                isInDict = isInDictForAnyCapitalization(dictInfo.mDictionary, text, capitalizeType);
341            } finally {
342                if (null != dictInfo) {
343                    if (!mDictionaryPool.offer(dictInfo)) {
344                        Log.e(TAG, "Can't re-insert a dictionary into its pool");
345                    }
346                }
347            }
348
349            final SuggestionsGatherer.Result result = suggestionsGatherer.getResults(
350                    capitalizeType, mLocale);
351
352            if (DBG) {
353                Log.i(TAG, "Spell checking results for " + text + " with suggestion limit "
354                        + suggestionsLimit);
355                Log.i(TAG, "IsInDict = " + isInDict);
356                Log.i(TAG, "LooksLikeTypo = " + (!isInDict));
357                Log.i(TAG, "HasRecommendedSuggestions = " + result.mHasRecommendedSuggestions);
358                if (null != result.mSuggestions) {
359                    for (String suggestion : result.mSuggestions) {
360                        Log.i(TAG, suggestion);
361                    }
362                }
363            }
364
365            final int flags =
366                    (isInDict ? SuggestionsInfo.RESULT_ATTR_IN_THE_DICTIONARY
367                            : SuggestionsInfo.RESULT_ATTR_LOOKS_LIKE_TYPO)
368                    | (result.mHasRecommendedSuggestions
369                            ? SuggestionsInfoCompatUtils
370                                    .getValueOf_RESULT_ATTR_HAS_RECOMMENDED_SUGGESTIONS()
371                            : 0);
372            final SuggestionsInfo retval = new SuggestionsInfo(flags, result.mSuggestions);
373            mSuggestionsCache.putSuggestionsToCache(text, prevWordsInfo, result.mSuggestions,
374                    flags);
375            return retval;
376        } catch (RuntimeException e) {
377            // Don't kill the keyboard if there is a bug in the spell checker
378            if (DBG) {
379                throw e;
380            } else {
381                Log.e(TAG, "Exception while spellcheking", e);
382                return AndroidSpellCheckerService.getNotInDictEmptySuggestions(
383                        false /* reportAsTypo */);
384            }
385        }
386    }
387
388    /*
389     * The spell checker acts on its own behalf. That is needed, in particular, to be able to
390     * access the dictionary files, which the provider restricts to the identity of Latin IME.
391     * Since it's called externally by the application, the spell checker is using the identity
392     * of the application by default unless we clearCallingIdentity.
393     * That's what the following method does.
394     */
395    @Override
396    public SuggestionsInfo onGetSuggestions(final TextInfo textInfo,
397            final int suggestionsLimit) {
398        long ident = Binder.clearCallingIdentity();
399        try {
400            return onGetSuggestionsInternal(textInfo, suggestionsLimit);
401        } finally {
402            Binder.restoreCallingIdentity(ident);
403        }
404    }
405}
406