Suggest.java revision 6234be1fe76740c458781b633f4ac66edd8ea84f
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import android.content.Context;
20import android.text.TextUtils;
21import android.util.Log;
22
23import com.android.inputmethod.keyboard.Keyboard;
24import com.android.inputmethod.keyboard.ProximityInfo;
25import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
26
27import java.io.File;
28import java.util.ArrayList;
29import java.util.HashMap;
30import java.util.HashSet;
31import java.util.Locale;
32import java.util.concurrent.ConcurrentHashMap;
33
34/**
35 * This class loads a dictionary and provides a list of suggestions for a given sequence of
36 * characters. This includes corrections and completions.
37 */
38public class Suggest {
39    public static final String TAG = Suggest.class.getSimpleName();
40
41    public static final int APPROX_MAX_WORD_LENGTH = 32;
42
43    // TODO: rename this to CORRECTION_OFF
44    public static final int CORRECTION_NONE = 0;
45    // TODO: rename this to CORRECTION_ON
46    public static final int CORRECTION_FULL = 1;
47
48    // It seems the following values are only used for logging.
49    public static final int DIC_USER_TYPED = 0;
50    public static final int DIC_MAIN = 1;
51    public static final int DIC_USER = 2;
52    public static final int DIC_USER_HISTORY = 3;
53    public static final int DIC_CONTACTS = 4;
54    public static final int DIC_WHITELIST = 6;
55    // If you add a type of dictionary, increment DIC_TYPE_LAST_ID
56    // TODO: this value seems unused. Remove it?
57    public static final int DIC_TYPE_LAST_ID = 6;
58    public static final String DICT_KEY_MAIN = "main";
59    public static final String DICT_KEY_CONTACTS = "contacts";
60    // User dictionary, the system-managed one.
61    public static final String DICT_KEY_USER = "user";
62    // User history dictionary internal to LatinIME
63    public static final String DICT_KEY_USER_HISTORY = "history";
64    public static final String DICT_KEY_WHITELIST ="whitelist";
65    // TODO: remove this map. This only serves as backward compatibility with a feature
66    // that has never been used and has been broken for a while.
67    private static final HashMap<String, Integer> sDictKeyToDictIndex
68            = new HashMap<String, Integer>();
69    static {
70        sDictKeyToDictIndex.put(DICT_KEY_MAIN, DIC_MAIN);
71        sDictKeyToDictIndex.put(DICT_KEY_USER, DIC_USER);
72        sDictKeyToDictIndex.put(DICT_KEY_USER_HISTORY, DIC_USER_HISTORY);
73        sDictKeyToDictIndex.put(DICT_KEY_CONTACTS, DIC_CONTACTS);
74        sDictKeyToDictIndex.put(DICT_KEY_WHITELIST, DIC_WHITELIST);
75    }
76
77    private static final boolean DBG = LatinImeLogger.sDBG;
78
79    private Dictionary mMainDictionary;
80    private ContactsBinaryDictionary mContactsDict;
81    private WhitelistDictionary mWhiteListDictionary;
82    private final ConcurrentHashMap<String, Dictionary> mDictionaries =
83            new ConcurrentHashMap<String, Dictionary>();
84
85    public static final int MAX_SUGGESTIONS = 18;
86
87    private float mAutoCorrectionThreshold;
88
89    private ArrayList<SuggestedWordInfo> mSuggestions = new ArrayList<SuggestedWordInfo>();
90    private CharSequence mConsideredWord;
91
92    // TODO: Remove these member variables by passing more context to addWord() callback method
93    private boolean mIsFirstCharCapitalized;
94    private boolean mIsAllUpperCase;
95    private int mTrailingSingleQuotesCount;
96
97    private static final int MINIMUM_SAFETY_NET_CHAR_LENGTH = 4;
98
99    public Suggest(final Context context, final Locale locale) {
100        initAsynchronously(context, locale);
101    }
102
103    /* package for test */ Suggest(final Context context, final File dictionary,
104            final long startOffset, final long length, final Locale locale) {
105        final Dictionary mainDict = DictionaryFactory.createDictionaryForTest(context, dictionary,
106                startOffset, length /* useFullEditDistance */, false, locale);
107        mMainDictionary = mainDict;
108        addOrReplaceDictionary(mDictionaries, DICT_KEY_MAIN, mainDict);
109        initWhitelistAndAutocorrectAndPool(context, locale);
110    }
111
112    private void initWhitelistAndAutocorrectAndPool(final Context context, final Locale locale) {
113        mWhiteListDictionary = new WhitelistDictionary(context, locale);
114        addOrReplaceDictionary(mDictionaries, DICT_KEY_WHITELIST, mWhiteListDictionary);
115    }
116
117    private void initAsynchronously(final Context context, final Locale locale) {
118        resetMainDict(context, locale);
119
120        // TODO: read the whitelist and init the pool asynchronously too.
121        // initPool should be done asynchronously now that the pool is thread-safe.
122        initWhitelistAndAutocorrectAndPool(context, locale);
123    }
124
125    private static void addOrReplaceDictionary(
126            final ConcurrentHashMap<String, Dictionary> dictionaries,
127            final String key, final Dictionary dict) {
128        final Dictionary oldDict = (dict == null)
129                ? dictionaries.remove(key)
130                : dictionaries.put(key, dict);
131        if (oldDict != null && dict != oldDict) {
132            oldDict.close();
133        }
134    }
135
136    public void resetMainDict(final Context context, final Locale locale) {
137        mMainDictionary = null;
138        new Thread("InitializeBinaryDictionary") {
139            @Override
140            public void run() {
141                final DictionaryCollection newMainDict =
142                        DictionaryFactory.createMainDictionaryFromManager(context, locale);
143                addOrReplaceDictionary(mDictionaries, DICT_KEY_MAIN, newMainDict);
144                mMainDictionary = newMainDict;
145            }
146        }.start();
147    }
148
149    // The main dictionary could have been loaded asynchronously.  Don't cache the return value
150    // of this method.
151    public boolean hasMainDictionary() {
152        return null != mMainDictionary && mMainDictionary.isInitialized();
153    }
154
155    public Dictionary getMainDictionary() {
156        return mMainDictionary;
157    }
158
159    public ContactsBinaryDictionary getContactsDictionary() {
160        return mContactsDict;
161    }
162
163    public ConcurrentHashMap<String, Dictionary> getUnigramDictionaries() {
164        return mDictionaries;
165    }
166
167    public static int getApproxMaxWordLength() {
168        return APPROX_MAX_WORD_LENGTH;
169    }
170
171    /**
172     * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted
173     * before the main dictionary, if set. This refers to the system-managed user dictionary.
174     */
175    public void setUserDictionary(UserBinaryDictionary userDictionary) {
176        addOrReplaceDictionary(mDictionaries, DICT_KEY_USER, userDictionary);
177    }
178
179    /**
180     * Sets an optional contacts dictionary resource to be loaded. It is also possible to remove
181     * the contacts dictionary by passing null to this method. In this case no contacts dictionary
182     * won't be used.
183     */
184    public void setContactsDictionary(ContactsBinaryDictionary contactsDictionary) {
185        mContactsDict = contactsDictionary;
186        addOrReplaceDictionary(mDictionaries, DICT_KEY_CONTACTS, contactsDictionary);
187    }
188
189    public void setUserHistoryDictionary(UserHistoryDictionary userHistoryDictionary) {
190        addOrReplaceDictionary(mDictionaries, DICT_KEY_USER_HISTORY, userHistoryDictionary);
191    }
192
193    public void setAutoCorrectionThreshold(float threshold) {
194        mAutoCorrectionThreshold = threshold;
195    }
196
197    private static CharSequence capitalizeWord(final boolean all, final boolean first,
198            final CharSequence word) {
199        if (TextUtils.isEmpty(word) || !(all || first)) return word;
200        final int wordLength = word.length();
201        final StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
202        // TODO: Must pay attention to locale when changing case.
203        if (all) {
204            sb.append(word.toString().toUpperCase());
205        } else if (first) {
206            sb.append(Character.toUpperCase(word.charAt(0)));
207            if (wordLength > 1) {
208                sb.append(word.subSequence(1, wordLength));
209            }
210        }
211        return sb;
212    }
213
214    // Compatibility for tests. TODO: remove this
215    public SuggestedWords getSuggestedWords(
216            final WordComposer wordComposer, CharSequence prevWordForBigram,
217            final ProximityInfo proximityInfo, final boolean isCorrectionEnabled) {
218        return getSuggestedWords(wordComposer, prevWordForBigram, proximityInfo,
219                isCorrectionEnabled, false);
220    }
221
222    // TODO: cleanup dictionaries looking up and suggestions building with SuggestedWords.Builder
223    public SuggestedWords getSuggestedWords(
224            final WordComposer wordComposer, CharSequence prevWordForBigram,
225            final ProximityInfo proximityInfo, final boolean isCorrectionEnabled,
226            final boolean isPrediction) {
227        LatinImeLogger.onStartSuggestion(prevWordForBigram);
228        mIsFirstCharCapitalized = !isPrediction && wordComposer.isFirstCharCapitalized();
229        mIsAllUpperCase = !isPrediction && wordComposer.isAllUpperCase();
230        mTrailingSingleQuotesCount = wordComposer.trailingSingleQuotesCount();
231        mSuggestions = new ArrayList<SuggestedWordInfo>(MAX_SUGGESTIONS);
232
233        final String typedWord = wordComposer.getTypedWord();
234        final String consideredWord = mTrailingSingleQuotesCount > 0
235                ? typedWord.substring(0, typedWord.length() - mTrailingSingleQuotesCount)
236                : typedWord;
237        // Treating USER_TYPED as UNIGRAM suggestion for logging now.
238        LatinImeLogger.onAddSuggestedWord(typedWord, Suggest.DIC_USER_TYPED, Dictionary.UNIGRAM);
239        mConsideredWord = consideredWord;
240
241        if (wordComposer.size() <= 1 && isCorrectionEnabled) {
242            // At first character typed, search only the bigrams
243            if (!TextUtils.isEmpty(prevWordForBigram)) {
244                final CharSequence lowerPrevWord;
245                if (StringUtils.hasUpperCase(prevWordForBigram)) {
246                    // TODO: Must pay attention to locale when changing case.
247                    lowerPrevWord = prevWordForBigram.toString().toLowerCase();
248                } else {
249                    lowerPrevWord = null;
250                }
251                for (final String key : mDictionaries.keySet()) {
252                    final int dicTypeId = sDictKeyToDictIndex.get(key);
253                    final Dictionary dictionary = mDictionaries.get(key);
254                    final ArrayList<SuggestedWordInfo> suggestions =
255                            dictionary.getBigrams(wordComposer, prevWordForBigram);
256                    if (null != lowerPrevWord) {
257                        suggestions.addAll(dictionary.getBigrams(wordComposer, lowerPrevWord));
258                    }
259                    for (final SuggestedWordInfo suggestion : suggestions) {
260                        final String suggestionStr = suggestion.mWord.toString();
261                        addWord(suggestionStr.toCharArray(), null, 0, suggestionStr.length(),
262                                suggestion.mScore, dicTypeId, Dictionary.BIGRAM, mSuggestions);
263                    }
264                }
265            }
266        } else if (wordComposer.size() > 1) {
267            final WordComposer wordComposerForLookup;
268            if (mTrailingSingleQuotesCount > 0) {
269                wordComposerForLookup = new WordComposer(wordComposer);
270                for (int i = mTrailingSingleQuotesCount - 1; i >= 0; --i) {
271                    wordComposerForLookup.deleteLast();
272                }
273            } else {
274                wordComposerForLookup = wordComposer;
275            }
276            // At second character typed, search the unigrams (scores being affected by bigrams)
277            for (final String key : mDictionaries.keySet()) {
278                // Skip UserUnigramDictionary and WhitelistDictionary to lookup
279                if (key.equals(DICT_KEY_USER_HISTORY) || key.equals(DICT_KEY_WHITELIST))
280                    continue;
281                final int dicTypeId = sDictKeyToDictIndex.get(key);
282                final Dictionary dictionary = mDictionaries.get(key);
283                final ArrayList<SuggestedWordInfo> suggestions = dictionary.getWords(
284                        wordComposerForLookup, prevWordForBigram, proximityInfo);
285                for (final SuggestedWordInfo suggestion : suggestions) {
286                    final String suggestionStr = suggestion.mWord.toString();
287                    addWord(suggestionStr.toCharArray(), null, 0, suggestionStr.length(),
288                            suggestion.mScore, dicTypeId, Dictionary.UNIGRAM, mSuggestions);
289                }
290            }
291        }
292
293        final CharSequence whitelistedWord = capitalizeWord(mIsAllUpperCase,
294                mIsFirstCharCapitalized, mWhiteListDictionary.getWhitelistedWord(consideredWord));
295
296        final boolean hasAutoCorrection;
297        if (isCorrectionEnabled) {
298            final CharSequence autoCorrection =
299                    AutoCorrection.computeAutoCorrectionWord(mDictionaries, wordComposer,
300                            mSuggestions, consideredWord, mAutoCorrectionThreshold,
301                            whitelistedWord);
302            hasAutoCorrection = (null != autoCorrection);
303        } else {
304            hasAutoCorrection = false;
305        }
306
307        if (whitelistedWord != null) {
308            if (mTrailingSingleQuotesCount > 0) {
309                final StringBuilder sb = new StringBuilder(whitelistedWord);
310                for (int i = mTrailingSingleQuotesCount - 1; i >= 0; --i) {
311                    sb.appendCodePoint(Keyboard.CODE_SINGLE_QUOTE);
312                }
313                mSuggestions.add(0, new SuggestedWordInfo(sb.toString(),
314                        SuggestedWordInfo.MAX_SCORE, SuggestedWordInfo.KIND_WHITELIST));
315            } else {
316                mSuggestions.add(0, new SuggestedWordInfo(whitelistedWord,
317                        SuggestedWordInfo.MAX_SCORE, SuggestedWordInfo.KIND_WHITELIST));
318            }
319        }
320
321        if (!isPrediction) {
322            mSuggestions.add(0, new SuggestedWordInfo(typedWord, SuggestedWordInfo.MAX_SCORE,
323                    SuggestedWordInfo.KIND_TYPED));
324        }
325        SuggestedWordInfo.removeDups(mSuggestions);
326
327        final ArrayList<SuggestedWordInfo> suggestionsList;
328        if (DBG && !mSuggestions.isEmpty()) {
329            suggestionsList = getSuggestionsInfoListWithDebugInfo(typedWord, mSuggestions);
330        } else {
331            suggestionsList = mSuggestions;
332        }
333
334        // TODO: Change this scheme - a boolean is not enough. A whitelisted word may be "valid"
335        // but still autocorrected from - in the case the whitelist only capitalizes the word.
336        // The whitelist should be case-insensitive, so it's not possible to be consistent with
337        // a boolean flag. Right now this is handled with a slight hack in
338        // WhitelistDictionary#shouldForciblyAutoCorrectFrom.
339        final boolean allowsToBeAutoCorrected = AutoCorrection.allowsToBeAutoCorrected(
340                getUnigramDictionaries(), consideredWord, wordComposer.isFirstCharCapitalized())
341        // If we don't have a main dictionary, we never want to auto-correct. The reason for this
342        // is, the user may have a contact whose name happens to match a valid word in their
343        // language, and it will unexpectedly auto-correct. For example, if the user types in
344        // English with no dictionary and has a "Will" in their contact list, "will" would
345        // always auto-correct to "Will" which is unwanted. Hence, no main dict => no auto-correct.
346                && hasMainDictionary();
347
348        boolean autoCorrectionAvailable = hasAutoCorrection;
349        if (isCorrectionEnabled) {
350            autoCorrectionAvailable |= !allowsToBeAutoCorrected;
351        }
352        // Don't auto-correct words with multiple capital letter
353        autoCorrectionAvailable &= !wordComposer.isMostlyCaps();
354        autoCorrectionAvailable &= !wordComposer.isResumed();
355        if (allowsToBeAutoCorrected && suggestionsList.size() > 1 && mAutoCorrectionThreshold > 0
356                && Suggest.shouldBlockAutoCorrectionBySafetyNet(typedWord,
357                        suggestionsList.get(1).mWord)) {
358            autoCorrectionAvailable = false;
359        }
360        return new SuggestedWords(suggestionsList,
361                !isPrediction && !allowsToBeAutoCorrected /* typedWordValid */,
362                !isPrediction && autoCorrectionAvailable /* hasAutoCorrectionCandidate */,
363                !isPrediction && allowsToBeAutoCorrected /* allowsToBeAutoCorrected */,
364                false /* isPunctuationSuggestions */,
365                false /* isObsoleteSuggestions */,
366                isPrediction);
367    }
368
369    private static ArrayList<SuggestedWordInfo> getSuggestionsInfoListWithDebugInfo(
370            final String typedWord, final ArrayList<SuggestedWordInfo> suggestions) {
371        final SuggestedWordInfo typedWordInfo = suggestions.get(0);
372        typedWordInfo.setDebugString("+");
373        final int suggestionsSize = suggestions.size();
374        final ArrayList<SuggestedWordInfo> suggestionsList =
375                new ArrayList<SuggestedWordInfo>(suggestionsSize);
376        suggestionsList.add(typedWordInfo);
377        // Note: i here is the index in mScores[], but the index in mSuggestions is one more
378        // than i because we added the typed word to mSuggestions without touching mScores.
379        for (int i = 0; i < suggestionsSize - 1; ++i) {
380            final SuggestedWordInfo cur = suggestions.get(i + 1);
381            final float normalizedScore = BinaryDictionary.calcNormalizedScore(
382                    typedWord, cur.toString(), cur.mScore);
383            final String scoreInfoString;
384            if (normalizedScore > 0) {
385                scoreInfoString = String.format("%d (%4.2f)", cur.mScore, normalizedScore);
386            } else {
387                scoreInfoString = Integer.toString(cur.mScore);
388            }
389            cur.setDebugString(scoreInfoString);
390            suggestionsList.add(cur);
391        }
392        return suggestionsList;
393    }
394
395    // TODO: Use codepoint instead of char
396    public boolean addWord(final char[] word, int[] indices, final int offset, final int length,
397            int score, final int dicTypeId, final int dataType,
398            final ArrayList<SuggestedWordInfo> suggestions) {
399        int dataTypeForLog = dataType;
400        final int prefMaxSuggestions = MAX_SUGGESTIONS;
401
402        int pos = 0;
403
404        // Check if it's the same word, only caps are different
405        if (StringUtils.equalsIgnoreCase(mConsideredWord, word, offset, length)) {
406            // TODO: remove this surrounding if clause and move this logic to
407            // getSuggestedWordBuilder.
408            if (suggestions.size() > 0) {
409                final SuggestedWordInfo currentHighestWord = suggestions.get(0);
410                // If the current highest word is also equal to typed word, we need to compare
411                // frequency to determine the insertion position. This does not ensure strictly
412                // correct ordering, but ensures the top score is on top which is enough for
413                // removing duplicates correctly.
414                if (StringUtils.equalsIgnoreCase(currentHighestWord.mWord, word, offset, length)
415                        && score <= currentHighestWord.mScore) {
416                    pos = 1;
417                }
418            }
419        } else {
420            // Check the last one's score and bail
421            if (suggestions.size() >= prefMaxSuggestions
422                    && suggestions.get(prefMaxSuggestions - 1).mScore >= score) return true;
423            while (pos < suggestions.size()) {
424                final int curScore = suggestions.get(pos).mScore;
425                if (curScore < score
426                        || (curScore == score && length < suggestions.get(pos).codePointCount())) {
427                    break;
428                }
429                pos++;
430            }
431        }
432        if (pos >= prefMaxSuggestions) {
433            return true;
434        }
435
436        final StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
437        // TODO: Must pay attention to locale when changing case.
438        if (mIsAllUpperCase) {
439            sb.append(new String(word, offset, length).toUpperCase());
440        } else if (mIsFirstCharCapitalized) {
441            sb.append(Character.toUpperCase(word[offset]));
442            if (length > 1) {
443                sb.append(word, offset + 1, length - 1);
444            }
445        } else {
446            sb.append(word, offset, length);
447        }
448        for (int i = mTrailingSingleQuotesCount - 1; i >= 0; --i) {
449            sb.appendCodePoint(Keyboard.CODE_SINGLE_QUOTE);
450        }
451        // TODO: figure out what type of suggestion this is
452        suggestions.add(pos, new SuggestedWordInfo(sb, score, SuggestedWordInfo.KIND_CORRECTION));
453        if (suggestions.size() > prefMaxSuggestions) {
454            suggestions.remove(prefMaxSuggestions);
455        } else {
456            LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog);
457        }
458        return true;
459    }
460
461    public void close() {
462        final HashSet<Dictionary> dictionaries = new HashSet<Dictionary>();
463        dictionaries.addAll(mDictionaries.values());
464        for (final Dictionary dictionary : dictionaries) {
465            dictionary.close();
466        }
467        mMainDictionary = null;
468    }
469
470    // TODO: Resolve the inconsistencies between the native auto correction algorithms and
471    // this safety net
472    public static boolean shouldBlockAutoCorrectionBySafetyNet(final String typedWord,
473            final CharSequence suggestion) {
474        // Safety net for auto correction.
475        // Actually if we hit this safety net, it's a bug.
476        // If user selected aggressive auto correction mode, there is no need to use the safety
477        // net.
478        // If the length of typed word is less than MINIMUM_SAFETY_NET_CHAR_LENGTH,
479        // we should not use net because relatively edit distance can be big.
480        final int typedWordLength = typedWord.length();
481        if (typedWordLength < Suggest.MINIMUM_SAFETY_NET_CHAR_LENGTH) {
482            return false;
483        }
484        final int maxEditDistanceOfNativeDictionary =
485                (typedWordLength < 5 ? 2 : typedWordLength / 2) + 1;
486        final int distance = BinaryDictionary.editDistance(typedWord, suggestion.toString());
487        if (DBG) {
488            Log.d(TAG, "Autocorrected edit distance = " + distance
489                    + ", " + maxEditDistanceOfNativeDictionary);
490        }
491        if (distance > maxEditDistanceOfNativeDictionary) {
492            if (DBG) {
493                Log.e(TAG, "Safety net: before = " + typedWord + ", after = " + suggestion);
494                Log.e(TAG, "(Error) The edit distance of this correction exceeds limit. "
495                        + "Turning off auto-correction.");
496            }
497            return true;
498        } else {
499            return false;
500        }
501    }
502}
503