1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import android.content.Context;
20import android.text.TextUtils;
21import android.util.Log;
22
23import com.android.inputmethod.keyboard.ProximityInfo;
24
25import java.io.File;
26import java.util.ArrayList;
27import java.util.Arrays;
28import java.util.HashMap;
29import java.util.HashSet;
30import java.util.Locale;
31import java.util.Map;
32import java.util.Set;
33
34/**
35 * This class loads a dictionary and provides a list of suggestions for a given sequence of
36 * characters. This includes corrections and completions.
37 */
38public class Suggest implements Dictionary.WordCallback {
39
40    public static final String TAG = Suggest.class.getSimpleName();
41
42    public static final int APPROX_MAX_WORD_LENGTH = 32;
43
44    public static final int CORRECTION_NONE = 0;
45    public static final int CORRECTION_BASIC = 1;
46    public static final int CORRECTION_FULL = 2;
47    public static final int CORRECTION_FULL_BIGRAM = 3;
48
49    /**
50     * Words that appear in both bigram and unigram data gets multiplier ranging from
51     * BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX depending on the score from
52     * bigram data.
53     */
54    public static final double BIGRAM_MULTIPLIER_MIN = 1.2;
55    public static final double BIGRAM_MULTIPLIER_MAX = 1.5;
56
57    /**
58     * Maximum possible bigram frequency. Will depend on how many bits are being used in data
59     * structure. Maximum bigram frequency will get the BIGRAM_MULTIPLIER_MAX as the multiplier.
60     */
61    public static final int MAXIMUM_BIGRAM_FREQUENCY = 127;
62
63    // It seems the following values are only used for logging.
64    public static final int DIC_USER_TYPED = 0;
65    public static final int DIC_MAIN = 1;
66    public static final int DIC_USER = 2;
67    public static final int DIC_USER_UNIGRAM = 3;
68    public static final int DIC_CONTACTS = 4;
69    public static final int DIC_USER_BIGRAM = 5;
70    public static final int DIC_WHITELIST = 6;
71    // If you add a type of dictionary, increment DIC_TYPE_LAST_ID
72    // TODO: this value seems unused. Remove it?
73    public static final int DIC_TYPE_LAST_ID = 6;
74    public static final String DICT_KEY_MAIN = "main";
75    public static final String DICT_KEY_CONTACTS = "contacts";
76    // User dictionary, the system-managed one.
77    public static final String DICT_KEY_USER = "user";
78    // User unigram dictionary, internal to LatinIME
79    public static final String DICT_KEY_USER_UNIGRAM = "user_unigram";
80    // User bigram dictionary, internal to LatinIME
81    public static final String DICT_KEY_USER_BIGRAM = "user_bigram";
82    public static final String DICT_KEY_WHITELIST ="whitelist";
83
84    private static final boolean DBG = LatinImeLogger.sDBG;
85
86    private AutoCorrection mAutoCorrection;
87
88    private Dictionary mMainDict;
89    private ContactsDictionary mContactsDict;
90    private WhitelistDictionary mWhiteListDictionary;
91    private final Map<String, Dictionary> mUnigramDictionaries = new HashMap<String, Dictionary>();
92    private final Map<String, Dictionary> mBigramDictionaries = new HashMap<String, Dictionary>();
93
94    private int mPrefMaxSuggestions = 18;
95
96    private static final int PREF_MAX_BIGRAMS = 60;
97
98    private double mAutoCorrectionThreshold;
99    private int[] mScores = new int[mPrefMaxSuggestions];
100    private int[] mBigramScores = new int[PREF_MAX_BIGRAMS];
101
102    private ArrayList<CharSequence> mSuggestions = new ArrayList<CharSequence>();
103    ArrayList<CharSequence> mBigramSuggestions  = new ArrayList<CharSequence>();
104    private CharSequence mTypedWord;
105
106    // TODO: Remove these member variables by passing more context to addWord() callback method
107    private boolean mIsFirstCharCapitalized;
108    private boolean mIsAllUpperCase;
109
110    private int mCorrectionMode = CORRECTION_BASIC;
111
112    public Suggest(final Context context, final int dictionaryResId, final Locale locale) {
113        initAsynchronously(context, dictionaryResId, locale);
114    }
115
116    /* package for test */ Suggest(final Context context, final File dictionary,
117            final long startOffset, final long length, final Flag[] flagArray,
118            final Locale locale) {
119        initSynchronously(null, DictionaryFactory.createDictionaryForTest(context, dictionary,
120                startOffset, length, flagArray), locale);
121    }
122
123    private void initWhitelistAndAutocorrectAndPool(final Context context, final Locale locale) {
124        mWhiteListDictionary = new WhitelistDictionary(context, locale);
125        addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_WHITELIST, mWhiteListDictionary);
126        mAutoCorrection = new AutoCorrection();
127        StringBuilderPool.ensureCapacity(mPrefMaxSuggestions, getApproxMaxWordLength());
128    }
129
130    private void initAsynchronously(final Context context, final int dictionaryResId,
131            final Locale locale) {
132        resetMainDict(context, dictionaryResId, locale);
133
134        // TODO: read the whitelist and init the pool asynchronously too.
135        // initPool should be done asynchronously now that the pool is thread-safe.
136        initWhitelistAndAutocorrectAndPool(context, locale);
137    }
138
139    private void initSynchronously(final Context context, final Dictionary mainDict,
140            final Locale locale) {
141        mMainDict = mainDict;
142        addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_MAIN, mainDict);
143        addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_MAIN, mainDict);
144        initWhitelistAndAutocorrectAndPool(context, locale);
145    }
146
147    private void addOrReplaceDictionary(Map<String, Dictionary> dictionaries, String key,
148            Dictionary dict) {
149        final Dictionary oldDict = (dict == null)
150                ? dictionaries.remove(key)
151                : dictionaries.put(key, dict);
152        if (oldDict != null && dict != oldDict) {
153            oldDict.close();
154        }
155    }
156
157    public void resetMainDict(final Context context, final int dictionaryResId,
158            final Locale locale) {
159        mMainDict = null;
160        new Thread("InitializeBinaryDictionary") {
161            @Override
162            public void run() {
163                final Dictionary newMainDict = DictionaryFactory.createDictionaryFromManager(
164                        context, locale, dictionaryResId);
165                mMainDict = newMainDict;
166                addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_MAIN, newMainDict);
167                addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_MAIN, newMainDict);
168            }
169        }.start();
170    }
171
172    public int getCorrectionMode() {
173        return mCorrectionMode;
174    }
175
176    public void setCorrectionMode(int mode) {
177        mCorrectionMode = mode;
178    }
179
180    // The main dictionary could have been loaded asynchronously.  Don't cache the return value
181    // of this method.
182    public boolean hasMainDictionary() {
183        return mMainDict != null;
184    }
185
186    public ContactsDictionary getContactsDictionary() {
187        return mContactsDict;
188    }
189
190    public Map<String, Dictionary> getUnigramDictionaries() {
191        return mUnigramDictionaries;
192    }
193
194    public int getApproxMaxWordLength() {
195        return APPROX_MAX_WORD_LENGTH;
196    }
197
198    /**
199     * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted
200     * before the main dictionary, if set. This refers to the system-managed user dictionary.
201     */
202    public void setUserDictionary(Dictionary userDictionary) {
203        addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_USER, userDictionary);
204    }
205
206    /**
207     * Sets an optional contacts dictionary resource to be loaded. It is also possible to remove
208     * the contacts dictionary by passing null to this method. In this case no contacts dictionary
209     * won't be used.
210     */
211    public void setContactsDictionary(ContactsDictionary contactsDictionary) {
212        mContactsDict = contactsDictionary;
213        addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_CONTACTS, contactsDictionary);
214        addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_CONTACTS, contactsDictionary);
215    }
216
217    public void setUserUnigramDictionary(Dictionary userUnigramDictionary) {
218        addOrReplaceDictionary(mUnigramDictionaries, DICT_KEY_USER_UNIGRAM, userUnigramDictionary);
219    }
220
221    public void setUserBigramDictionary(Dictionary userBigramDictionary) {
222        addOrReplaceDictionary(mBigramDictionaries, DICT_KEY_USER_BIGRAM, userBigramDictionary);
223    }
224
225    public void setAutoCorrectionThreshold(double threshold) {
226        mAutoCorrectionThreshold = threshold;
227    }
228
229    public boolean isAggressiveAutoCorrectionMode() {
230        return (mAutoCorrectionThreshold == 0);
231    }
232
233    /**
234     * Number of suggestions to generate from the input key sequence. This has
235     * to be a number between 1 and 100 (inclusive).
236     * @param maxSuggestions
237     * @throws IllegalArgumentException if the number is out of range
238     */
239    public void setMaxSuggestions(int maxSuggestions) {
240        if (maxSuggestions < 1 || maxSuggestions > 100) {
241            throw new IllegalArgumentException("maxSuggestions must be between 1 and 100");
242        }
243        mPrefMaxSuggestions = maxSuggestions;
244        mScores = new int[mPrefMaxSuggestions];
245        mBigramScores = new int[PREF_MAX_BIGRAMS];
246        collectGarbage(mSuggestions, mPrefMaxSuggestions);
247        StringBuilderPool.ensureCapacity(mPrefMaxSuggestions, getApproxMaxWordLength());
248    }
249
250    /**
251     * Returns a object which represents suggested words that match the list of character codes
252     * passed in. This object contents will be overwritten the next time this function is called.
253     * @param wordComposer contains what is currently being typed
254     * @param prevWordForBigram previous word (used only for bigram)
255     * @return suggested words object.
256     */
257    public SuggestedWords getSuggestions(final WordComposer wordComposer,
258            final CharSequence prevWordForBigram, final ProximityInfo proximityInfo) {
259        return getSuggestedWordBuilder(wordComposer, prevWordForBigram,
260                proximityInfo).build();
261    }
262
263    private CharSequence capitalizeWord(boolean all, boolean first, CharSequence word) {
264        if (TextUtils.isEmpty(word) || !(all || first)) return word;
265        final int wordLength = word.length();
266        final StringBuilder sb = StringBuilderPool.getStringBuilder(getApproxMaxWordLength());
267        // TODO: Must pay attention to locale when changing case.
268        if (all) {
269            sb.append(word.toString().toUpperCase());
270        } else if (first) {
271            sb.append(Character.toUpperCase(word.charAt(0)));
272            if (wordLength > 1) {
273                sb.append(word.subSequence(1, wordLength));
274            }
275        }
276        return sb;
277    }
278
279    protected void addBigramToSuggestions(CharSequence bigram) {
280        // TODO: Try to be a little more shrewd with resource allocation.
281        // At the moment we copy this object because the StringBuilders are pooled (see
282        // StringBuilderPool.java) and when we are finished using mSuggestions and
283        // mBigramSuggestions we will take everything from both and insert them back in the
284        // pool, so we can't allow the same object to be in both lists at the same time.
285        final StringBuilder sb = StringBuilderPool.getStringBuilder(getApproxMaxWordLength());
286        sb.append(bigram);
287        mSuggestions.add(sb);
288    }
289
290    // TODO: cleanup dictionaries looking up and suggestions building with SuggestedWords.Builder
291    public SuggestedWords.Builder getSuggestedWordBuilder(
292            final WordComposer wordComposer, CharSequence prevWordForBigram,
293            final ProximityInfo proximityInfo) {
294        LatinImeLogger.onStartSuggestion(prevWordForBigram);
295        mAutoCorrection.init();
296        mIsFirstCharCapitalized = wordComposer.isFirstCharCapitalized();
297        mIsAllUpperCase = wordComposer.isAllUpperCase();
298        collectGarbage(mSuggestions, mPrefMaxSuggestions);
299        Arrays.fill(mScores, 0);
300
301        // Save a lowercase version of the original word
302        String typedWord = wordComposer.getTypedWord();
303        if (typedWord != null) {
304            // Treating USER_TYPED as UNIGRAM suggestion for logging now.
305            LatinImeLogger.onAddSuggestedWord(typedWord, Suggest.DIC_USER_TYPED,
306                    Dictionary.DataType.UNIGRAM);
307        }
308        mTypedWord = typedWord;
309
310        if (wordComposer.size() <= 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM
311                || mCorrectionMode == CORRECTION_BASIC)) {
312            // At first character typed, search only the bigrams
313            Arrays.fill(mBigramScores, 0);
314            collectGarbage(mBigramSuggestions, PREF_MAX_BIGRAMS);
315
316            if (!TextUtils.isEmpty(prevWordForBigram)) {
317                CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase();
318                if (mMainDict != null && mMainDict.isValidWord(lowerPrevWord)) {
319                    prevWordForBigram = lowerPrevWord;
320                }
321                for (final Dictionary dictionary : mBigramDictionaries.values()) {
322                    dictionary.getBigrams(wordComposer, prevWordForBigram, this);
323                }
324                if (TextUtils.isEmpty(typedWord)) {
325                    // Nothing entered: return all bigrams for the previous word
326                    int insertCount = Math.min(mBigramSuggestions.size(), mPrefMaxSuggestions);
327                    for (int i = 0; i < insertCount; ++i) {
328                        addBigramToSuggestions(mBigramSuggestions.get(i));
329                    }
330                } else {
331                    // Word entered: return only bigrams that match the first char of the typed word
332                    @SuppressWarnings("null")
333                    final char currentChar = typedWord.charAt(0);
334                    // TODO: Must pay attention to locale when changing case.
335                    final char currentCharUpper = Character.toUpperCase(currentChar);
336                    int count = 0;
337                    final int bigramSuggestionSize = mBigramSuggestions.size();
338                    for (int i = 0; i < bigramSuggestionSize; i++) {
339                        final CharSequence bigramSuggestion = mBigramSuggestions.get(i);
340                        final char bigramSuggestionFirstChar = bigramSuggestion.charAt(0);
341                        if (bigramSuggestionFirstChar == currentChar
342                                || bigramSuggestionFirstChar == currentCharUpper) {
343                            addBigramToSuggestions(bigramSuggestion);
344                            if (++count > mPrefMaxSuggestions) break;
345                        }
346                    }
347                }
348            }
349
350        } else if (wordComposer.size() > 1) {
351            // At second character typed, search the unigrams (scores being affected by bigrams)
352            for (final String key : mUnigramDictionaries.keySet()) {
353                // Skip UserUnigramDictionary and WhitelistDictionary to lookup
354                if (key.equals(DICT_KEY_USER_UNIGRAM) || key.equals(DICT_KEY_WHITELIST))
355                    continue;
356                final Dictionary dictionary = mUnigramDictionaries.get(key);
357                dictionary.getWords(wordComposer, this, proximityInfo);
358            }
359        }
360        final String typedWordString = typedWord == null ? null : typedWord.toString();
361
362        CharSequence whitelistedWord = capitalizeWord(mIsAllUpperCase, mIsFirstCharCapitalized,
363                mWhiteListDictionary.getWhitelistedWord(typedWordString));
364
365        mAutoCorrection.updateAutoCorrectionStatus(mUnigramDictionaries, wordComposer,
366                mSuggestions, mScores, typedWord, mAutoCorrectionThreshold, mCorrectionMode,
367                whitelistedWord);
368
369        if (whitelistedWord != null) {
370            mSuggestions.add(0, whitelistedWord);
371        }
372
373        if (typedWord != null) {
374            mSuggestions.add(0, typedWordString);
375        }
376        Utils.removeDupes(mSuggestions);
377
378        if (DBG) {
379            double normalizedScore = mAutoCorrection.getNormalizedScore();
380            ArrayList<SuggestedWords.SuggestedWordInfo> scoreInfoList =
381                    new ArrayList<SuggestedWords.SuggestedWordInfo>();
382            scoreInfoList.add(new SuggestedWords.SuggestedWordInfo("+", false));
383            for (int i = 0; i < mScores.length; ++i) {
384                if (normalizedScore > 0) {
385                    final String scoreThreshold = String.format("%d (%4.2f)", mScores[i],
386                            normalizedScore);
387                    scoreInfoList.add(
388                            new SuggestedWords.SuggestedWordInfo(scoreThreshold, false));
389                    normalizedScore = 0.0;
390                } else {
391                    final String score = Integer.toString(mScores[i]);
392                    scoreInfoList.add(new SuggestedWords.SuggestedWordInfo(score, false));
393                }
394            }
395            for (int i = mScores.length; i < mSuggestions.size(); ++i) {
396                scoreInfoList.add(new SuggestedWords.SuggestedWordInfo("--", false));
397            }
398            return new SuggestedWords.Builder().addWords(mSuggestions, scoreInfoList);
399        }
400        return new SuggestedWords.Builder().addWords(mSuggestions, null);
401    }
402
403    public boolean hasAutoCorrection() {
404        return mAutoCorrection.hasAutoCorrection();
405    }
406
407    @Override
408    public boolean addWord(final char[] word, final int offset, final int length, int score,
409            final int dicTypeId, final Dictionary.DataType dataType) {
410        Dictionary.DataType dataTypeForLog = dataType;
411        final ArrayList<CharSequence> suggestions;
412        final int[] sortedScores;
413        final int prefMaxSuggestions;
414        if(dataType == Dictionary.DataType.BIGRAM) {
415            suggestions = mBigramSuggestions;
416            sortedScores = mBigramScores;
417            prefMaxSuggestions = PREF_MAX_BIGRAMS;
418        } else {
419            suggestions = mSuggestions;
420            sortedScores = mScores;
421            prefMaxSuggestions = mPrefMaxSuggestions;
422        }
423
424        int pos = 0;
425
426        // Check if it's the same word, only caps are different
427        if (Utils.equalsIgnoreCase(mTypedWord, word, offset, length)) {
428            // TODO: remove this surrounding if clause and move this logic to
429            // getSuggestedWordBuilder.
430            if (suggestions.size() > 0) {
431                final String currentHighestWord = suggestions.get(0).toString();
432                // If the current highest word is also equal to typed word, we need to compare
433                // frequency to determine the insertion position. This does not ensure strictly
434                // correct ordering, but ensures the top score is on top which is enough for
435                // removing duplicates correctly.
436                if (Utils.equalsIgnoreCase(currentHighestWord, word, offset, length)
437                        && score <= sortedScores[0]) {
438                    pos = 1;
439                }
440            }
441        } else {
442            if (dataType == Dictionary.DataType.UNIGRAM) {
443                // Check if the word was already added before (by bigram data)
444                int bigramSuggestion = searchBigramSuggestion(word,offset,length);
445                if(bigramSuggestion >= 0) {
446                    dataTypeForLog = Dictionary.DataType.BIGRAM;
447                    // turn freq from bigram into multiplier specified above
448                    double multiplier = (((double) mBigramScores[bigramSuggestion])
449                            / MAXIMUM_BIGRAM_FREQUENCY)
450                            * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN)
451                            + BIGRAM_MULTIPLIER_MIN;
452                    /* Log.d(TAG,"bigram num: " + bigramSuggestion
453                            + "  wordB: " + mBigramSuggestions.get(bigramSuggestion).toString()
454                            + "  currentScore: " + score + "  bigramScore: "
455                            + mBigramScores[bigramSuggestion]
456                            + "  multiplier: " + multiplier); */
457                    score = (int)Math.round((score * multiplier));
458                }
459            }
460
461            // Check the last one's score and bail
462            if (sortedScores[prefMaxSuggestions - 1] >= score) return true;
463            while (pos < prefMaxSuggestions) {
464                if (sortedScores[pos] < score
465                        || (sortedScores[pos] == score && length < suggestions.get(pos).length())) {
466                    break;
467                }
468                pos++;
469            }
470        }
471        if (pos >= prefMaxSuggestions) {
472            return true;
473        }
474
475        System.arraycopy(sortedScores, pos, sortedScores, pos + 1, prefMaxSuggestions - pos - 1);
476        sortedScores[pos] = score;
477        final StringBuilder sb = StringBuilderPool.getStringBuilder(getApproxMaxWordLength());
478        // TODO: Must pay attention to locale when changing case.
479        if (mIsAllUpperCase) {
480            sb.append(new String(word, offset, length).toUpperCase());
481        } else if (mIsFirstCharCapitalized) {
482            sb.append(Character.toUpperCase(word[offset]));
483            if (length > 1) {
484                sb.append(word, offset + 1, length - 1);
485            }
486        } else {
487            sb.append(word, offset, length);
488        }
489        suggestions.add(pos, sb);
490        if (suggestions.size() > prefMaxSuggestions) {
491            final CharSequence garbage = suggestions.remove(prefMaxSuggestions);
492            if (garbage instanceof StringBuilder) {
493                StringBuilderPool.recycle((StringBuilder)garbage);
494            }
495        } else {
496            LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog);
497        }
498        return true;
499    }
500
501    private int searchBigramSuggestion(final char[] word, final int offset, final int length) {
502        // TODO This is almost O(n^2). Might need fix.
503        // search whether the word appeared in bigram data
504        int bigramSuggestSize = mBigramSuggestions.size();
505        for(int i = 0; i < bigramSuggestSize; i++) {
506            if(mBigramSuggestions.get(i).length() == length) {
507                boolean chk = true;
508                for(int j = 0; j < length; j++) {
509                    if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) {
510                        chk = false;
511                        break;
512                    }
513                }
514                if(chk) return i;
515            }
516        }
517
518        return -1;
519    }
520
521    private void collectGarbage(ArrayList<CharSequence> suggestions, int prefMaxSuggestions) {
522        int poolSize = StringBuilderPool.getSize();
523        int garbageSize = suggestions.size();
524        while (poolSize < prefMaxSuggestions && garbageSize > 0) {
525            final CharSequence garbage = suggestions.get(garbageSize - 1);
526            if (garbage instanceof StringBuilder) {
527                StringBuilderPool.recycle((StringBuilder)garbage);
528                poolSize++;
529            }
530            garbageSize--;
531        }
532        if (poolSize == prefMaxSuggestions + 1) {
533            Log.w("Suggest", "String pool got too big: " + poolSize);
534        }
535        suggestions.clear();
536    }
537
538    public void close() {
539        final Set<Dictionary> dictionaries = new HashSet<Dictionary>();
540        dictionaries.addAll(mUnigramDictionaries.values());
541        dictionaries.addAll(mBigramDictionaries.values());
542        for (final Dictionary dictionary : dictionaries) {
543            dictionary.close();
544        }
545        mMainDict = null;
546    }
547}
548