Suggest.java revision 8553b5ec315660ab53dd9234e64e1e39ea09ec0f
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import android.content.Context;
20import android.text.AutoText;
21import android.text.TextUtils;
22import android.util.Log;
23import android.view.View;
24
25import java.util.ArrayList;
26import java.util.Arrays;
27
28/**
29 * This class loads a dictionary and provides a list of suggestions for a given sequence of
30 * characters. This includes corrections and completions.
31 */
32public class Suggest implements Dictionary.WordCallback {
33
34    public static final String TAG = Suggest.class.getSimpleName();
35
36    public static final int APPROX_MAX_WORD_LENGTH = 32;
37
38    public static final int CORRECTION_NONE = 0;
39    public static final int CORRECTION_BASIC = 1;
40    public static final int CORRECTION_FULL = 2;
41    public static final int CORRECTION_FULL_BIGRAM = 3;
42
43    /**
44     * Words that appear in both bigram and unigram data gets multiplier ranging from
45     * BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX depending on the frequency score from
46     * bigram data.
47     */
48    public static final double BIGRAM_MULTIPLIER_MIN = 1.2;
49    public static final double BIGRAM_MULTIPLIER_MAX = 1.5;
50
51    /**
52     * Maximum possible bigram frequency. Will depend on how many bits are being used in data
53     * structure. Maximum bigram freqeuncy will get the BIGRAM_MULTIPLIER_MAX as the multiplier.
54     */
55    public static final int MAXIMUM_BIGRAM_FREQUENCY = 127;
56
57    public static final int DIC_USER_TYPED = 0;
58    public static final int DIC_MAIN = 1;
59    public static final int DIC_USER = 2;
60    public static final int DIC_AUTO = 3;
61    public static final int DIC_CONTACTS = 4;
62    // If you add a type of dictionary, increment DIC_TYPE_LAST_ID
63    public static final int DIC_TYPE_LAST_ID = 4;
64
65    static final int LARGE_DICTIONARY_THRESHOLD = 200 * 1000;
66
67    private static final boolean DBG = LatinImeLogger.sDBG;
68
69    private BinaryDictionary mMainDict;
70
71    private Dictionary mUserDictionary;
72
73    private Dictionary mAutoDictionary;
74
75    private Dictionary mContactsDictionary;
76
77    private Dictionary mUserBigramDictionary;
78
79    private int mPrefMaxSuggestions = 12;
80
81    private static final int PREF_MAX_BIGRAMS = 60;
82
83    private boolean mAutoTextEnabled;
84
85    private double mAutoCorrectionThreshold;
86    private int[] mPriorities = new int[mPrefMaxSuggestions];
87    private int[] mBigramPriorities = new int[PREF_MAX_BIGRAMS];
88
89    // Handle predictive correction for only the first 1280 characters for performance reasons
90    // If we support scripts that need latin characters beyond that, we should probably use some
91    // kind of a sparse array or language specific list with a mapping lookup table.
92    // 1280 is the size of the BASE_CHARS array in ExpandableDictionary, which is a basic set of
93    // latin characters.
94    private int[] mNextLettersFrequencies = new int[1280];
95    private ArrayList<CharSequence> mSuggestions = new ArrayList<CharSequence>();
96    ArrayList<CharSequence> mBigramSuggestions  = new ArrayList<CharSequence>();
97    private ArrayList<CharSequence> mStringPool = new ArrayList<CharSequence>();
98    private boolean mHasAutoCorrection;
99    private String mLowerOriginalWord;
100
101    // TODO: Remove these member variables by passing more context to addWord() callback method
102    private boolean mIsFirstCharCapitalized;
103    private boolean mIsAllUpperCase;
104
105    private int mCorrectionMode = CORRECTION_BASIC;
106
107    public Suggest(Context context, int dictionaryResId) {
108        mMainDict = BinaryDictionary.initDictionary(context, dictionaryResId, DIC_MAIN);
109        initPool();
110    }
111
112    private void initPool() {
113        for (int i = 0; i < mPrefMaxSuggestions; i++) {
114            StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
115            mStringPool.add(sb);
116        }
117    }
118
119    public void setAutoTextEnabled(boolean enabled) {
120        mAutoTextEnabled = enabled;
121    }
122
123    public int getCorrectionMode() {
124        return mCorrectionMode;
125    }
126
127    public void setCorrectionMode(int mode) {
128        mCorrectionMode = mode;
129    }
130
131    public boolean hasMainDictionary() {
132        return mMainDict != null && mMainDict.getSize() > LARGE_DICTIONARY_THRESHOLD;
133    }
134
135    public int getApproxMaxWordLength() {
136        return APPROX_MAX_WORD_LENGTH;
137    }
138
139    /**
140     * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted
141     * before the main dictionary, if set.
142     */
143    public void setUserDictionary(Dictionary userDictionary) {
144        mUserDictionary = userDictionary;
145    }
146
147    /**
148     * Sets an optional contacts dictionary resource to be loaded.
149     */
150    public void setContactsDictionary(Dictionary userDictionary) {
151        mContactsDictionary = userDictionary;
152    }
153
154    public void setAutoDictionary(Dictionary autoDictionary) {
155        mAutoDictionary = autoDictionary;
156    }
157
158    public void setUserBigramDictionary(Dictionary userBigramDictionary) {
159        mUserBigramDictionary = userBigramDictionary;
160    }
161
162    public void setAutoCorrectionThreshold(double threshold) {
163        mAutoCorrectionThreshold = threshold;
164    }
165
166    public boolean isAggressiveAutoCorrectionMode() {
167        return (mAutoCorrectionThreshold == 0);
168    }
169
170    /**
171     * Number of suggestions to generate from the input key sequence. This has
172     * to be a number between 1 and 100 (inclusive).
173     * @param maxSuggestions
174     * @throws IllegalArgumentException if the number is out of range
175     */
176    public void setMaxSuggestions(int maxSuggestions) {
177        if (maxSuggestions < 1 || maxSuggestions > 100) {
178            throw new IllegalArgumentException("maxSuggestions must be between 1 and 100");
179        }
180        mPrefMaxSuggestions = maxSuggestions;
181        mPriorities = new int[mPrefMaxSuggestions];
182        mBigramPriorities = new int[PREF_MAX_BIGRAMS];
183        collectGarbage(mSuggestions, mPrefMaxSuggestions);
184        while (mStringPool.size() < mPrefMaxSuggestions) {
185            StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
186            mStringPool.add(sb);
187        }
188    }
189
190    /**
191     * Returns a object which represents suggested words that match the list of character codes
192     * passed in. This object contents will be overwritten the next time this function is called.
193     * @param view a view for retrieving the context for AutoText
194     * @param wordComposer contains what is currently being typed
195     * @param prevWordForBigram previous word (used only for bigram)
196     * @return suggested words object.
197     */
198    public SuggestedWords getSuggestions(View view, WordComposer wordComposer,
199            CharSequence prevWordForBigram) {
200        return getSuggestedWordBuilder(view, wordComposer, prevWordForBigram).build();
201    }
202
203    // TODO: cleanup dictionaries looking up and suggestions building with SuggestedWords.Builder
204    public SuggestedWords.Builder getSuggestedWordBuilder(View view, WordComposer wordComposer,
205            CharSequence prevWordForBigram) {
206        LatinImeLogger.onStartSuggestion(prevWordForBigram);
207        mHasAutoCorrection = false;
208        mIsFirstCharCapitalized = wordComposer.isFirstCharCapitalized();
209        mIsAllUpperCase = wordComposer.isAllUpperCase();
210        collectGarbage(mSuggestions, mPrefMaxSuggestions);
211        Arrays.fill(mPriorities, 0);
212        Arrays.fill(mNextLettersFrequencies, 0);
213
214        // Save a lowercase version of the original word
215        CharSequence typedWord = wordComposer.getTypedWord();
216        if (typedWord != null) {
217            final String typedWordString = typedWord.toString();
218            typedWord = typedWordString;
219            mLowerOriginalWord = typedWordString.toLowerCase();
220            // Treating USER_TYPED as UNIGRAM suggestion for logging now.
221            LatinImeLogger.onAddSuggestedWord(typedWordString, Suggest.DIC_USER_TYPED,
222                    Dictionary.DataType.UNIGRAM);
223        } else {
224            mLowerOriginalWord = "";
225        }
226
227        double normalizedScore = Integer.MIN_VALUE;
228        if (wordComposer.size() == 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM
229                || mCorrectionMode == CORRECTION_BASIC)) {
230            // At first character typed, search only the bigrams
231            Arrays.fill(mBigramPriorities, 0);
232            collectGarbage(mBigramSuggestions, PREF_MAX_BIGRAMS);
233
234            if (!TextUtils.isEmpty(prevWordForBigram)) {
235                CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase();
236                if (mMainDict != null && mMainDict.isValidWord(lowerPrevWord)) {
237                    prevWordForBigram = lowerPrevWord;
238                }
239                if (mUserBigramDictionary != null) {
240                    mUserBigramDictionary.getBigrams(wordComposer, prevWordForBigram, this,
241                            mNextLettersFrequencies);
242                }
243                if (mContactsDictionary != null) {
244                    mContactsDictionary.getBigrams(wordComposer, prevWordForBigram, this,
245                            mNextLettersFrequencies);
246                }
247                if (mMainDict != null) {
248                    mMainDict.getBigrams(wordComposer, prevWordForBigram, this,
249                            mNextLettersFrequencies);
250                }
251                char currentChar = wordComposer.getTypedWord().charAt(0);
252                char currentCharUpper = Character.toUpperCase(currentChar);
253                int count = 0;
254                int bigramSuggestionSize = mBigramSuggestions.size();
255                for (int i = 0; i < bigramSuggestionSize; i++) {
256                    if (mBigramSuggestions.get(i).charAt(0) == currentChar
257                            || mBigramSuggestions.get(i).charAt(0) == currentCharUpper) {
258                        int poolSize = mStringPool.size();
259                        StringBuilder sb = poolSize > 0 ?
260                                (StringBuilder) mStringPool.remove(poolSize - 1)
261                                : new StringBuilder(getApproxMaxWordLength());
262                        sb.setLength(0);
263                        sb.append(mBigramSuggestions.get(i));
264                        mSuggestions.add(count++, sb);
265                        if (count > mPrefMaxSuggestions) break;
266                    }
267                }
268            }
269
270        } else if (wordComposer.size() > 1) {
271            // At second character typed, search the unigrams (scores being affected by bigrams)
272            if (mUserDictionary != null || mContactsDictionary != null) {
273                if (mUserDictionary != null) {
274                    mUserDictionary.getWords(wordComposer, this, mNextLettersFrequencies);
275                }
276                if (mContactsDictionary != null) {
277                    mContactsDictionary.getWords(wordComposer, this, mNextLettersFrequencies);
278                }
279
280                if (mSuggestions.size() > 0 && isValidWord(typedWord)
281                        && (mCorrectionMode == CORRECTION_FULL
282                        || mCorrectionMode == CORRECTION_FULL_BIGRAM)) {
283                    if (DBG) {
284                        Log.d(TAG, "Auto corrected by CORRECTION_FULL.");
285                    }
286                    mHasAutoCorrection = true;
287                }
288            }
289            if (mMainDict != null) mMainDict.getWords(wordComposer, this, mNextLettersFrequencies);
290            if ((mCorrectionMode == CORRECTION_FULL || mCorrectionMode == CORRECTION_FULL_BIGRAM)
291                    && mSuggestions.size() > 0 && mPriorities.length > 0) {
292                // TODO: when the normalized score of the first suggestion is nearly equals to
293                //       the normalized score of the second suggestion, behave less aggressive.
294                normalizedScore = Utils.calcNormalizedScore(
295                        typedWord, mSuggestions.get(0), mPriorities[0]);
296                if (DBG) {
297                    Log.d(TAG, "Normalized " + typedWord + "," + mSuggestions.get(0) + ","
298                            + mPriorities[0] + ", " + normalizedScore
299                            + "(" + mAutoCorrectionThreshold + ")");
300                }
301                if (normalizedScore >= mAutoCorrectionThreshold) {
302                    if (DBG) {
303                        Log.d(TAG, "Auto corrected by S-threthhold.");
304                    }
305                    mHasAutoCorrection = true;
306                }
307            }
308        }
309        if (typedWord != null) {
310            mSuggestions.add(0, typedWord.toString());
311        }
312        if (mAutoTextEnabled) {
313            int i = 0;
314            int max = 6;
315            // Don't autotext the suggestions from the dictionaries
316            if (mCorrectionMode == CORRECTION_BASIC) max = 1;
317            while (i < mSuggestions.size() && i < max) {
318                String suggestedWord = mSuggestions.get(i).toString().toLowerCase();
319                CharSequence autoText =
320                        AutoText.get(suggestedWord, 0, suggestedWord.length(), view);
321                // Is there an AutoText (also known as Quick Fixes) correction?
322                boolean canAdd = autoText != null;
323                // Capitalize as needed
324                final int autoTextLength = autoText != null ? autoText.length() : 0;
325                if (autoTextLength > 0 && (mIsAllUpperCase || mIsFirstCharCapitalized)) {
326                    int poolSize = mStringPool.size();
327                    StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove(
328                            poolSize - 1) : new StringBuilder(getApproxMaxWordLength());
329                    sb.setLength(0);
330                    if (mIsAllUpperCase) {
331                        sb.append(autoText.toString().toUpperCase());
332                    } else if (mIsFirstCharCapitalized) {
333                        sb.append(Character.toUpperCase(autoText.charAt(0)));
334                        if (autoTextLength > 1) {
335                            sb.append(autoText.subSequence(1, autoTextLength));
336                        }
337                    }
338                    autoText = sb.toString();
339                }
340                // Is that correction already the current prediction (or original word)?
341                canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i));
342                // Is that correction already the next predicted word?
343                if (canAdd && i + 1 < mSuggestions.size() && mCorrectionMode != CORRECTION_BASIC) {
344                    canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i + 1));
345                }
346                if (canAdd) {
347                    if (DBG) {
348                        Log.d(TAG, "Auto corrected by AUTOTEXT.");
349                    }
350                    mHasAutoCorrection = true;
351                    mSuggestions.add(i + 1, autoText);
352                    i++;
353                }
354                i++;
355            }
356        }
357        removeDupes();
358        if (DBG) {
359            ArrayList<SuggestedWords.SuggestedWordInfo> frequencyInfoList =
360                    new ArrayList<SuggestedWords.SuggestedWordInfo>();
361            frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo("+", false));
362            final int priorityLength = mPriorities.length;
363            for (int i = 0; i < priorityLength; ++i) {
364                if (normalizedScore > 0) {
365                    final String priorityThreshold = Integer.toString(mPriorities[i]) + " (" +
366                            normalizedScore + ")";
367                    frequencyInfoList.add(
368                            new SuggestedWords.SuggestedWordInfo(priorityThreshold, false));
369                    normalizedScore = 0.0;
370                } else {
371                    final String priority = Integer.toString(mPriorities[i]);
372                    frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo(priority, false));
373                }
374            }
375            for (int i = priorityLength; i < mSuggestions.size(); ++i) {
376                frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo("--", false));
377            }
378            return new SuggestedWords.Builder().addWords(mSuggestions, frequencyInfoList);
379        } else {
380            return new SuggestedWords.Builder().addWords(mSuggestions, null);
381        }
382    }
383
384    public int[] getNextLettersFrequencies() {
385        return mNextLettersFrequencies;
386    }
387
388    private void removeDupes() {
389        final ArrayList<CharSequence> suggestions = mSuggestions;
390        if (suggestions.size() < 2) return;
391        int i = 1;
392        // Don't cache suggestions.size(), since we may be removing items
393        while (i < suggestions.size()) {
394            final CharSequence cur = suggestions.get(i);
395            // Compare each candidate with each previous candidate
396            for (int j = 0; j < i; j++) {
397                CharSequence previous = suggestions.get(j);
398                if (TextUtils.equals(cur, previous)) {
399                    removeFromSuggestions(i);
400                    i--;
401                    break;
402                }
403            }
404            i++;
405        }
406    }
407
408    private void removeFromSuggestions(int index) {
409        CharSequence garbage = mSuggestions.remove(index);
410        if (garbage != null && garbage instanceof StringBuilder) {
411            mStringPool.add(garbage);
412        }
413    }
414
415    public boolean hasAutoCorrection() {
416        return mHasAutoCorrection;
417    }
418
419    private boolean compareCaseInsensitive(final String mLowerOriginalWord,
420            final char[] word, final int offset, final int length) {
421        final int originalLength = mLowerOriginalWord.length();
422        if (originalLength == length && Character.isUpperCase(word[offset])) {
423            for (int i = 0; i < originalLength; i++) {
424                if (mLowerOriginalWord.charAt(i) != Character.toLowerCase(word[offset+i])) {
425                    return false;
426                }
427            }
428            return true;
429        }
430        return false;
431    }
432
433    @Override
434    public boolean addWord(final char[] word, final int offset, final int length, int freq,
435            final int dicTypeId, final Dictionary.DataType dataType) {
436        Dictionary.DataType dataTypeForLog = dataType;
437        ArrayList<CharSequence> suggestions;
438        int[] priorities;
439        int prefMaxSuggestions;
440        if(dataType == Dictionary.DataType.BIGRAM) {
441            suggestions = mBigramSuggestions;
442            priorities = mBigramPriorities;
443            prefMaxSuggestions = PREF_MAX_BIGRAMS;
444        } else {
445            suggestions = mSuggestions;
446            priorities = mPriorities;
447            prefMaxSuggestions = mPrefMaxSuggestions;
448        }
449
450        int pos = 0;
451
452        // Check if it's the same word, only caps are different
453        if (compareCaseInsensitive(mLowerOriginalWord, word, offset, length)) {
454            pos = 0;
455        } else {
456            if (dataType == Dictionary.DataType.UNIGRAM) {
457                // Check if the word was already added before (by bigram data)
458                int bigramSuggestion = searchBigramSuggestion(word,offset,length);
459                if(bigramSuggestion >= 0) {
460                    dataTypeForLog = Dictionary.DataType.BIGRAM;
461                    // turn freq from bigram into multiplier specified above
462                    double multiplier = (((double) mBigramPriorities[bigramSuggestion])
463                            / MAXIMUM_BIGRAM_FREQUENCY)
464                            * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN)
465                            + BIGRAM_MULTIPLIER_MIN;
466                    /* Log.d(TAG,"bigram num: " + bigramSuggestion
467                            + "  wordB: " + mBigramSuggestions.get(bigramSuggestion).toString()
468                            + "  currentPriority: " + freq + "  bigramPriority: "
469                            + mBigramPriorities[bigramSuggestion]
470                            + "  multiplier: " + multiplier); */
471                    freq = (int)Math.round((freq * multiplier));
472                }
473            }
474
475            // Check the last one's priority and bail
476            if (priorities[prefMaxSuggestions - 1] >= freq) return true;
477            while (pos < prefMaxSuggestions) {
478                if (priorities[pos] < freq
479                        || (priorities[pos] == freq && length < suggestions.get(pos).length())) {
480                    break;
481                }
482                pos++;
483            }
484        }
485        if (pos >= prefMaxSuggestions) {
486            return true;
487        }
488
489        System.arraycopy(priorities, pos, priorities, pos + 1, prefMaxSuggestions - pos - 1);
490        priorities[pos] = freq;
491        int poolSize = mStringPool.size();
492        StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove(poolSize - 1)
493                : new StringBuilder(getApproxMaxWordLength());
494        sb.setLength(0);
495        if (mIsAllUpperCase) {
496            sb.append(new String(word, offset, length).toUpperCase());
497        } else if (mIsFirstCharCapitalized) {
498            sb.append(Character.toUpperCase(word[offset]));
499            if (length > 1) {
500                sb.append(word, offset + 1, length - 1);
501            }
502        } else {
503            sb.append(word, offset, length);
504        }
505        suggestions.add(pos, sb);
506        if (suggestions.size() > prefMaxSuggestions) {
507            CharSequence garbage = suggestions.remove(prefMaxSuggestions);
508            if (garbage instanceof StringBuilder) {
509                mStringPool.add(garbage);
510            }
511        } else {
512            LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog);
513        }
514        return true;
515    }
516
517    private int searchBigramSuggestion(final char[] word, final int offset, final int length) {
518        // TODO This is almost O(n^2). Might need fix.
519        // search whether the word appeared in bigram data
520        int bigramSuggestSize = mBigramSuggestions.size();
521        for(int i = 0; i < bigramSuggestSize; i++) {
522            if(mBigramSuggestions.get(i).length() == length) {
523                boolean chk = true;
524                for(int j = 0; j < length; j++) {
525                    if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) {
526                        chk = false;
527                        break;
528                    }
529                }
530                if(chk) return i;
531            }
532        }
533
534        return -1;
535    }
536
537    public boolean isValidWord(final CharSequence word) {
538        if (word == null || word.length() == 0 || mMainDict == null) {
539            return false;
540        }
541        return mMainDict.isValidWord(word)
542                || (mUserDictionary != null && mUserDictionary.isValidWord(word))
543                || (mAutoDictionary != null && mAutoDictionary.isValidWord(word))
544                || (mContactsDictionary != null && mContactsDictionary.isValidWord(word));
545    }
546
547    private void collectGarbage(ArrayList<CharSequence> suggestions, int prefMaxSuggestions) {
548        int poolSize = mStringPool.size();
549        int garbageSize = suggestions.size();
550        while (poolSize < prefMaxSuggestions && garbageSize > 0) {
551            CharSequence garbage = suggestions.get(garbageSize - 1);
552            if (garbage != null && garbage instanceof StringBuilder) {
553                mStringPool.add(garbage);
554                poolSize++;
555            }
556            garbageSize--;
557        }
558        if (poolSize == prefMaxSuggestions + 1) {
559            Log.w("Suggest", "String pool got too big: " + poolSize);
560        }
561        suggestions.clear();
562    }
563
564    public void close() {
565        if (mMainDict != null) {
566            mMainDict.close();
567            mMainDict = null;
568        }
569        if (mUserDictionary != null) {
570            mUserDictionary.close();
571            mUserDictionary = null;
572        }
573        if (mUserBigramDictionary != null) {
574            mUserBigramDictionary.close();
575            mUserBigramDictionary = null;
576        }
577        if (mContactsDictionary != null) {
578            mContactsDictionary.close();
579            mContactsDictionary = null;
580        }
581        if (mAutoDictionary != null) {
582            mAutoDictionary.close();
583            mAutoDictionary = null;
584        }
585    }
586}
587