Suggest.java revision 9f67e12a0e3f77985fb8bafe0db4c00e32317b9a
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import android.content.Context;
20import android.text.AutoText;
21import android.text.TextUtils;
22import android.util.Log;
23import android.view.View;
24
25import java.io.File;
26import java.util.ArrayList;
27import java.util.Arrays;
28
29/**
30 * This class loads a dictionary and provides a list of suggestions for a given sequence of
31 * characters. This includes corrections and completions.
32 */
33public class Suggest implements Dictionary.WordCallback {
34
35    public static final String TAG = Suggest.class.getSimpleName();
36
37    public static final int APPROX_MAX_WORD_LENGTH = 32;
38
39    public static final int CORRECTION_NONE = 0;
40    public static final int CORRECTION_BASIC = 1;
41    public static final int CORRECTION_FULL = 2;
42    public static final int CORRECTION_FULL_BIGRAM = 3;
43
44    /**
45     * Words that appear in both bigram and unigram data gets multiplier ranging from
46     * BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX depending on the frequency score from
47     * bigram data.
48     */
49    public static final double BIGRAM_MULTIPLIER_MIN = 1.2;
50    public static final double BIGRAM_MULTIPLIER_MAX = 1.5;
51
52    /**
53     * Maximum possible bigram frequency. Will depend on how many bits are being used in data
54     * structure. Maximum bigram freqeuncy will get the BIGRAM_MULTIPLIER_MAX as the multiplier.
55     */
56    public static final int MAXIMUM_BIGRAM_FREQUENCY = 127;
57
58    public static final int DIC_USER_TYPED = 0;
59    public static final int DIC_MAIN = 1;
60    public static final int DIC_USER = 2;
61    public static final int DIC_AUTO = 3;
62    public static final int DIC_CONTACTS = 4;
63    // If you add a type of dictionary, increment DIC_TYPE_LAST_ID
64    public static final int DIC_TYPE_LAST_ID = 4;
65
66    static final int LARGE_DICTIONARY_THRESHOLD = 200 * 1000;
67
68    private static final boolean DBG = LatinImeLogger.sDBG;
69
70    private AutoCorrection mAutoCorrection;
71
72    private BinaryDictionary mMainDict;
73
74    private Dictionary mUserDictionary;
75
76    private Dictionary mAutoDictionary;
77
78    private Dictionary mContactsDictionary;
79
80    private Dictionary mUserBigramDictionary;
81
82    private int mPrefMaxSuggestions = 12;
83
84    private static final int PREF_MAX_BIGRAMS = 60;
85
86    private boolean mQuickFixesEnabled;
87
88    private double mAutoCorrectionThreshold;
89    private int[] mPriorities = new int[mPrefMaxSuggestions];
90    private int[] mBigramPriorities = new int[PREF_MAX_BIGRAMS];
91
92    private ArrayList<CharSequence> mSuggestions = new ArrayList<CharSequence>();
93    ArrayList<CharSequence> mBigramSuggestions  = new ArrayList<CharSequence>();
94    private ArrayList<CharSequence> mStringPool = new ArrayList<CharSequence>();
95    private String mLowerOriginalWord;
96
97    // TODO: Remove these member variables by passing more context to addWord() callback method
98    private boolean mIsFirstCharCapitalized;
99    private boolean mIsAllUpperCase;
100
101    private int mCorrectionMode = CORRECTION_BASIC;
102
103    public Suggest(Context context, int dictionaryResId) {
104        mMainDict = BinaryDictionary.initDictionary(context, dictionaryResId, DIC_MAIN);
105        init();
106    }
107
108    /* package for test */ Suggest(File dictionary, long startOffset, long length) {
109        mMainDict = BinaryDictionary.initDictionary(dictionary, startOffset, length, DIC_MAIN);
110        init();
111    }
112
113    private void init() {
114        mAutoCorrection = new AutoCorrection();
115        initPool();
116    }
117
118    private void initPool() {
119        for (int i = 0; i < mPrefMaxSuggestions; i++) {
120            StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
121            mStringPool.add(sb);
122        }
123    }
124
125    public void setQuickFixesEnabled(boolean enabled) {
126        mQuickFixesEnabled = enabled;
127    }
128
129    public int getCorrectionMode() {
130        return mCorrectionMode;
131    }
132
133    public void setCorrectionMode(int mode) {
134        mCorrectionMode = mode;
135    }
136
137    public boolean hasMainDictionary() {
138        return mMainDict != null && mMainDict.getSize() > LARGE_DICTIONARY_THRESHOLD;
139    }
140
141    public int getApproxMaxWordLength() {
142        return APPROX_MAX_WORD_LENGTH;
143    }
144
145    /**
146     * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted
147     * before the main dictionary, if set.
148     */
149    public void setUserDictionary(Dictionary userDictionary) {
150        mUserDictionary = userDictionary;
151    }
152
153    /**
154     * Sets an optional contacts dictionary resource to be loaded.
155     */
156    public void setContactsDictionary(Dictionary userDictionary) {
157        mContactsDictionary = userDictionary;
158    }
159
160    public void setAutoDictionary(Dictionary autoDictionary) {
161        mAutoDictionary = autoDictionary;
162    }
163
164    public void setUserBigramDictionary(Dictionary userBigramDictionary) {
165        mUserBigramDictionary = userBigramDictionary;
166    }
167
168    public void setAutoCorrectionThreshold(double threshold) {
169        mAutoCorrectionThreshold = threshold;
170    }
171
172    public boolean isAggressiveAutoCorrectionMode() {
173        return (mAutoCorrectionThreshold == 0);
174    }
175
176    /**
177     * Number of suggestions to generate from the input key sequence. This has
178     * to be a number between 1 and 100 (inclusive).
179     * @param maxSuggestions
180     * @throws IllegalArgumentException if the number is out of range
181     */
182    public void setMaxSuggestions(int maxSuggestions) {
183        if (maxSuggestions < 1 || maxSuggestions > 100) {
184            throw new IllegalArgumentException("maxSuggestions must be between 1 and 100");
185        }
186        mPrefMaxSuggestions = maxSuggestions;
187        mPriorities = new int[mPrefMaxSuggestions];
188        mBigramPriorities = new int[PREF_MAX_BIGRAMS];
189        collectGarbage(mSuggestions, mPrefMaxSuggestions);
190        while (mStringPool.size() < mPrefMaxSuggestions) {
191            StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
192            mStringPool.add(sb);
193        }
194    }
195
196    /**
197     * Returns a object which represents suggested words that match the list of character codes
198     * passed in. This object contents will be overwritten the next time this function is called.
199     * @param view a view for retrieving the context for AutoText
200     * @param wordComposer contains what is currently being typed
201     * @param prevWordForBigram previous word (used only for bigram)
202     * @return suggested words object.
203     */
204    public SuggestedWords getSuggestions(View view, WordComposer wordComposer,
205            CharSequence prevWordForBigram) {
206        return getSuggestedWordBuilder(view, wordComposer, prevWordForBigram).build();
207    }
208
209    // TODO: cleanup dictionaries looking up and suggestions building with SuggestedWords.Builder
210    public SuggestedWords.Builder getSuggestedWordBuilder(View view, WordComposer wordComposer,
211            CharSequence prevWordForBigram) {
212        LatinImeLogger.onStartSuggestion(prevWordForBigram);
213        mAutoCorrection.init();
214        mIsFirstCharCapitalized = wordComposer.isFirstCharCapitalized();
215        mIsAllUpperCase = wordComposer.isAllUpperCase();
216        collectGarbage(mSuggestions, mPrefMaxSuggestions);
217        Arrays.fill(mPriorities, 0);
218
219        // Save a lowercase version of the original word
220        CharSequence typedWord = wordComposer.getTypedWord();
221        if (typedWord != null) {
222            final String typedWordString = typedWord.toString();
223            typedWord = typedWordString;
224            mLowerOriginalWord = typedWordString.toLowerCase();
225            // Treating USER_TYPED as UNIGRAM suggestion for logging now.
226            LatinImeLogger.onAddSuggestedWord(typedWordString, Suggest.DIC_USER_TYPED,
227                    Dictionary.DataType.UNIGRAM);
228        } else {
229            mLowerOriginalWord = "";
230        }
231
232        if (wordComposer.size() == 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM
233                || mCorrectionMode == CORRECTION_BASIC)) {
234            // At first character typed, search only the bigrams
235            Arrays.fill(mBigramPriorities, 0);
236            collectGarbage(mBigramSuggestions, PREF_MAX_BIGRAMS);
237
238            if (!TextUtils.isEmpty(prevWordForBigram)) {
239                CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase();
240                if (mMainDict != null && mMainDict.isValidWord(lowerPrevWord)) {
241                    prevWordForBigram = lowerPrevWord;
242                }
243                if (mUserBigramDictionary != null) {
244                    mUserBigramDictionary.getBigrams(wordComposer, prevWordForBigram, this);
245                }
246                if (mContactsDictionary != null) {
247                    mContactsDictionary.getBigrams(wordComposer, prevWordForBigram, this);
248                }
249                if (mMainDict != null) {
250                    mMainDict.getBigrams(wordComposer, prevWordForBigram, this);
251                }
252                char currentChar = wordComposer.getTypedWord().charAt(0);
253                char currentCharUpper = Character.toUpperCase(currentChar);
254                int count = 0;
255                int bigramSuggestionSize = mBigramSuggestions.size();
256                for (int i = 0; i < bigramSuggestionSize; i++) {
257                    if (mBigramSuggestions.get(i).charAt(0) == currentChar
258                            || mBigramSuggestions.get(i).charAt(0) == currentCharUpper) {
259                        int poolSize = mStringPool.size();
260                        StringBuilder sb = poolSize > 0 ?
261                                (StringBuilder) mStringPool.remove(poolSize - 1)
262                                : new StringBuilder(getApproxMaxWordLength());
263                        sb.setLength(0);
264                        sb.append(mBigramSuggestions.get(i));
265                        mSuggestions.add(count++, sb);
266                        if (count > mPrefMaxSuggestions) break;
267                    }
268                }
269            }
270
271        } else if (wordComposer.size() > 1) {
272            // At second character typed, search the unigrams (scores being affected by bigrams)
273            if (mUserDictionary != null || mContactsDictionary != null) {
274                if (mUserDictionary != null) {
275                    mUserDictionary.getWords(wordComposer, this);
276                }
277                if (mContactsDictionary != null) {
278                    mContactsDictionary.getWords(wordComposer, this);
279                }
280            }
281            if (mMainDict != null) mMainDict.getWords(wordComposer, this);
282        }
283        CharSequence autoText = null;
284        final String typedWordString = typedWord == null ? null : typedWord.toString();
285        if (typedWord != null) {
286            // Apply quick fix only for the typed word.
287            if (mQuickFixesEnabled) {
288                final String lowerCaseTypedWord = typedWordString.toLowerCase();
289                CharSequence tempAutoText =
290                        AutoText.get(lowerCaseTypedWord, 0, lowerCaseTypedWord.length(), view);
291                // Is there an AutoText (also known as Quick Fixes) correction?
292                // Capitalize as needed
293                if (!TextUtils.isEmpty(tempAutoText)
294                        && (mIsAllUpperCase || mIsFirstCharCapitalized)) {
295                    final int tempAutoTextLength = tempAutoText.length();
296                    final int poolSize = mStringPool.size();
297                    final StringBuilder sb =
298                            poolSize > 0 ? (StringBuilder) mStringPool.remove(poolSize - 1)
299                                    : new StringBuilder(getApproxMaxWordLength());
300                    sb.setLength(0);
301                    if (mIsAllUpperCase) {
302                        sb.append(tempAutoText.toString().toUpperCase());
303                    } else if (mIsFirstCharCapitalized) {
304                        sb.append(Character.toUpperCase(tempAutoText.charAt(0)));
305                        if (tempAutoTextLength > 1) {
306                            sb.append(tempAutoText.subSequence(1, tempAutoTextLength));
307                        }
308                    }
309                    tempAutoText = sb.toString();
310                }
311                boolean canAdd = tempAutoText != null;
312                // Is that correction already the current prediction (or original word)?
313                canAdd &= !TextUtils.equals(tempAutoText, typedWord);
314                // Is that correction already the next predicted word?
315                if (canAdd && mSuggestions.size() > 0 && mCorrectionMode != CORRECTION_BASIC) {
316                    canAdd &= !TextUtils.equals(tempAutoText, mSuggestions.get(0));
317                }
318                if (canAdd) {
319                    if (DBG) {
320                        Log.d(TAG, "Auto corrected by AUTOTEXT.");
321                    }
322                    autoText = tempAutoText;
323                }
324            }
325        }
326
327        mAutoCorrection.updateAutoCorrectionStatus(this, wordComposer, mSuggestions, mPriorities,
328                typedWord, mAutoCorrectionThreshold, mCorrectionMode, autoText);
329
330        if (autoText != null) {
331            mSuggestions.add(0, autoText);
332        }
333
334        if (typedWord != null) {
335            mSuggestions.add(0, typedWordString);
336        }
337        removeDupes();
338
339        if (DBG) {
340            double normalizedScore = mAutoCorrection.getNormalizedScore();
341            ArrayList<SuggestedWords.SuggestedWordInfo> frequencyInfoList =
342                    new ArrayList<SuggestedWords.SuggestedWordInfo>();
343            frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo("+", false));
344            final int priorityLength = mPriorities.length;
345            for (int i = 0; i < priorityLength; ++i) {
346                if (normalizedScore > 0) {
347                    final String priorityThreshold = Integer.toString(mPriorities[i]) + " (" +
348                            normalizedScore + ")";
349                    frequencyInfoList.add(
350                            new SuggestedWords.SuggestedWordInfo(priorityThreshold, false));
351                    normalizedScore = 0.0;
352                } else {
353                    final String priority = Integer.toString(mPriorities[i]);
354                    frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo(priority, false));
355                }
356            }
357            for (int i = priorityLength; i < mSuggestions.size(); ++i) {
358                frequencyInfoList.add(new SuggestedWords.SuggestedWordInfo("--", false));
359            }
360            return new SuggestedWords.Builder().addWords(mSuggestions, frequencyInfoList);
361        }
362        return new SuggestedWords.Builder().addWords(mSuggestions, null);
363    }
364
365    private void removeDupes() {
366        final ArrayList<CharSequence> suggestions = mSuggestions;
367        if (suggestions.size() < 2) return;
368        int i = 1;
369        // Don't cache suggestions.size(), since we may be removing items
370        while (i < suggestions.size()) {
371            final CharSequence cur = suggestions.get(i);
372            // Compare each candidate with each previous candidate
373            for (int j = 0; j < i; j++) {
374                CharSequence previous = suggestions.get(j);
375                if (TextUtils.equals(cur, previous)) {
376                    removeFromSuggestions(i);
377                    i--;
378                    break;
379                }
380            }
381            i++;
382        }
383    }
384
385    private void removeFromSuggestions(int index) {
386        CharSequence garbage = mSuggestions.remove(index);
387        if (garbage != null && garbage instanceof StringBuilder) {
388            mStringPool.add(garbage);
389        }
390    }
391
392    public boolean hasAutoCorrection() {
393        return mAutoCorrection.hasAutoCorrection();
394    }
395
396    private static boolean compareCaseInsensitive(final String lowerOriginalWord,
397            final char[] word, final int offset, final int length) {
398        final int originalLength = lowerOriginalWord.length();
399        if (originalLength == length && Character.isUpperCase(word[offset])) {
400            for (int i = 0; i < originalLength; i++) {
401                if (lowerOriginalWord.charAt(i) != Character.toLowerCase(word[offset+i])) {
402                    return false;
403                }
404            }
405            return true;
406        }
407        return false;
408    }
409
410    @Override
411    public boolean addWord(final char[] word, final int offset, final int length, int freq,
412            final int dicTypeId, final Dictionary.DataType dataType) {
413        Dictionary.DataType dataTypeForLog = dataType;
414        ArrayList<CharSequence> suggestions;
415        int[] priorities;
416        int prefMaxSuggestions;
417        if(dataType == Dictionary.DataType.BIGRAM) {
418            suggestions = mBigramSuggestions;
419            priorities = mBigramPriorities;
420            prefMaxSuggestions = PREF_MAX_BIGRAMS;
421        } else {
422            suggestions = mSuggestions;
423            priorities = mPriorities;
424            prefMaxSuggestions = mPrefMaxSuggestions;
425        }
426
427        int pos = 0;
428
429        // Check if it's the same word, only caps are different
430        if (compareCaseInsensitive(mLowerOriginalWord, word, offset, length)) {
431            pos = 0;
432        } else {
433            if (dataType == Dictionary.DataType.UNIGRAM) {
434                // Check if the word was already added before (by bigram data)
435                int bigramSuggestion = searchBigramSuggestion(word,offset,length);
436                if(bigramSuggestion >= 0) {
437                    dataTypeForLog = Dictionary.DataType.BIGRAM;
438                    // turn freq from bigram into multiplier specified above
439                    double multiplier = (((double) mBigramPriorities[bigramSuggestion])
440                            / MAXIMUM_BIGRAM_FREQUENCY)
441                            * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN)
442                            + BIGRAM_MULTIPLIER_MIN;
443                    /* Log.d(TAG,"bigram num: " + bigramSuggestion
444                            + "  wordB: " + mBigramSuggestions.get(bigramSuggestion).toString()
445                            + "  currentPriority: " + freq + "  bigramPriority: "
446                            + mBigramPriorities[bigramSuggestion]
447                            + "  multiplier: " + multiplier); */
448                    freq = (int)Math.round((freq * multiplier));
449                }
450            }
451
452            // Check the last one's priority and bail
453            if (priorities[prefMaxSuggestions - 1] >= freq) return true;
454            while (pos < prefMaxSuggestions) {
455                if (priorities[pos] < freq
456                        || (priorities[pos] == freq && length < suggestions.get(pos).length())) {
457                    break;
458                }
459                pos++;
460            }
461        }
462        if (pos >= prefMaxSuggestions) {
463            return true;
464        }
465
466        System.arraycopy(priorities, pos, priorities, pos + 1, prefMaxSuggestions - pos - 1);
467        priorities[pos] = freq;
468        int poolSize = mStringPool.size();
469        StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove(poolSize - 1)
470                : new StringBuilder(getApproxMaxWordLength());
471        sb.setLength(0);
472        if (mIsAllUpperCase) {
473            sb.append(new String(word, offset, length).toUpperCase());
474        } else if (mIsFirstCharCapitalized) {
475            sb.append(Character.toUpperCase(word[offset]));
476            if (length > 1) {
477                sb.append(word, offset + 1, length - 1);
478            }
479        } else {
480            sb.append(word, offset, length);
481        }
482        suggestions.add(pos, sb);
483        if (suggestions.size() > prefMaxSuggestions) {
484            CharSequence garbage = suggestions.remove(prefMaxSuggestions);
485            if (garbage instanceof StringBuilder) {
486                mStringPool.add(garbage);
487            }
488        } else {
489            LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog);
490        }
491        return true;
492    }
493
494    private int searchBigramSuggestion(final char[] word, final int offset, final int length) {
495        // TODO This is almost O(n^2). Might need fix.
496        // search whether the word appeared in bigram data
497        int bigramSuggestSize = mBigramSuggestions.size();
498        for(int i = 0; i < bigramSuggestSize; i++) {
499            if(mBigramSuggestions.get(i).length() == length) {
500                boolean chk = true;
501                for(int j = 0; j < length; j++) {
502                    if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) {
503                        chk = false;
504                        break;
505                    }
506                }
507                if(chk) return i;
508            }
509        }
510
511        return -1;
512    }
513
514    public boolean isValidWord(final CharSequence word) {
515        if (word == null || word.length() == 0 || mMainDict == null) {
516            return false;
517        }
518        return mMainDict.isValidWord(word)
519                || (mUserDictionary != null && mUserDictionary.isValidWord(word))
520                || (mAutoDictionary != null && mAutoDictionary.isValidWord(word))
521                || (mContactsDictionary != null && mContactsDictionary.isValidWord(word));
522    }
523
524    private void collectGarbage(ArrayList<CharSequence> suggestions, int prefMaxSuggestions) {
525        int poolSize = mStringPool.size();
526        int garbageSize = suggestions.size();
527        while (poolSize < prefMaxSuggestions && garbageSize > 0) {
528            CharSequence garbage = suggestions.get(garbageSize - 1);
529            if (garbage != null && garbage instanceof StringBuilder) {
530                mStringPool.add(garbage);
531                poolSize++;
532            }
533            garbageSize--;
534        }
535        if (poolSize == prefMaxSuggestions + 1) {
536            Log.w("Suggest", "String pool got too big: " + poolSize);
537        }
538        suggestions.clear();
539    }
540
541    public void close() {
542        if (mMainDict != null) {
543            mMainDict.close();
544            mMainDict = null;
545        }
546        if (mUserDictionary != null) {
547            mUserDictionary.close();
548            mUserDictionary = null;
549        }
550        if (mUserBigramDictionary != null) {
551            mUserBigramDictionary.close();
552            mUserBigramDictionary = null;
553        }
554        if (mContactsDictionary != null) {
555            mContactsDictionary.close();
556            mContactsDictionary = null;
557        }
558        if (mAutoDictionary != null) {
559            mAutoDictionary.close();
560            mAutoDictionary = null;
561        }
562    }
563}
564