// BinaryDictionary.java revision 2fa3693c264a4c150ac307d9bb7f6f8f18cc4ffc
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.android.inputmethod.latin;
18
19import android.text.TextUtils;
20import android.util.SparseArray;
21
22import com.android.inputmethod.annotations.UsedForTesting;
23import com.android.inputmethod.keyboard.ProximityInfo;
24import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
25import com.android.inputmethod.latin.settings.NativeSuggestOptions;
26import com.android.inputmethod.latin.utils.CollectionUtils;
27import com.android.inputmethod.latin.utils.JniUtils;
28import com.android.inputmethod.latin.utils.StringUtils;
29import com.android.inputmethod.latin.utils.UnigramProperty;
30
31import java.io.File;
32import java.util.ArrayList;
33import java.util.Arrays;
34import java.util.Locale;
35import java.util.Map;
36
37/**
38 * Implements a static, compacted, binary dictionary of standard words.
39 */
40// TODO: All methods which should be locked need to have a suffix "Locked".
41public final class BinaryDictionary extends Dictionary {
42    private static final String TAG = BinaryDictionary.class.getSimpleName();
43
44    // Must be equal to MAX_WORD_LENGTH in native/jni/src/defines.h
45    private static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
46    // Must be equal to MAX_RESULTS in native/jni/src/defines.h
47    private static final int MAX_RESULTS = 18;
48    // The cutoff returned by native for auto-commit confidence.
49    // Must be equal to CONFIDENCE_TO_AUTO_COMMIT in native/jni/src/defines.h
50    private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000;
51
52    @UsedForTesting
53    public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
54    @UsedForTesting
55    public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
56    @UsedForTesting
57    public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
58    @UsedForTesting
59    public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
60
61    public static final int NOT_A_VALID_TIMESTAMP = -1;
62
63    // Format to get unigram flags from native side via getUnigramPropertyNative().
64    private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT = 4;
65    private static final int FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
66    private static final int FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX = 1;
67    private static final int FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX = 2;
68    private static final int FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
69
70    // Format to get unigram historical info from native side via getUnigramPropertyNative().
71    private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT = 3;
72    private static final int FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX = 0;
73    private static final int FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX = 1;
74    private static final int FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX = 2;
75
76    private long mNativeDict;
77    private final Locale mLocale;
78    private final long mDictSize;
79    private final String mDictFilePath;
80    private final int[] mInputCodePoints = new int[MAX_WORD_LENGTH];
81    private final int[] mOutputCodePoints = new int[MAX_WORD_LENGTH * MAX_RESULTS];
82    private final int[] mSpaceIndices = new int[MAX_RESULTS];
83    private final int[] mOutputScores = new int[MAX_RESULTS];
84    private final int[] mOutputTypes = new int[MAX_RESULTS];
85    // Only one result is ever used
86    private final int[] mOutputAutoCommitFirstWordConfidence = new int[1];
87
88    private final NativeSuggestOptions mNativeSuggestOptions = new NativeSuggestOptions();
89
90    private final SparseArray<DicTraverseSession> mDicTraverseSessions =
91            CollectionUtils.newSparseArray();
92
93    // TODO: There should be a way to remove used DicTraverseSession objects from
94    // {@code mDicTraverseSessions}.
95    private DicTraverseSession getTraverseSession(final int traverseSessionId) {
96        synchronized(mDicTraverseSessions) {
97            DicTraverseSession traverseSession = mDicTraverseSessions.get(traverseSessionId);
98            if (traverseSession == null) {
99                traverseSession = mDicTraverseSessions.get(traverseSessionId);
100                if (traverseSession == null) {
101                    traverseSession = new DicTraverseSession(mLocale, mNativeDict, mDictSize);
102                    mDicTraverseSessions.put(traverseSessionId, traverseSession);
103                }
104            }
105            return traverseSession;
106        }
107    }
108
109    /**
110     * Constructor for the binary dictionary. This is supposed to be called from the
111     * dictionary factory.
112     * @param filename the name of the file to read through native code.
113     * @param offset the offset of the dictionary data within the file.
114     * @param length the length of the binary data.
115     * @param useFullEditDistance whether to use the full edit distance in suggestions
116     * @param dictType the dictionary type, as a human-readable string
117     * @param isUpdatable whether to open the dictionary file in writable mode.
118     */
119    public BinaryDictionary(final String filename, final long offset, final long length,
120            final boolean useFullEditDistance, final Locale locale, final String dictType,
121            final boolean isUpdatable) {
122        super(dictType);
123        mLocale = locale;
124        mDictSize = length;
125        mDictFilePath = filename;
126        mNativeSuggestOptions.setUseFullEditDistance(useFullEditDistance);
127        loadDictionary(filename, offset, length, isUpdatable);
128    }
129
130    static {
131        JniUtils.loadNativeLibrary();
132    }
133
134    private static native boolean createEmptyDictFileNative(String filePath, long dictVersion,
135            String[] attributeKeyStringArray, String[] attributeValueStringArray);
136    private static native long openNative(String sourceDir, long dictOffset, long dictSize,
137            boolean isUpdatable);
138    private static native void flushNative(long dict, String filePath);
139    private static native boolean needsToRunGCNative(long dict, boolean mindsBlockByGC);
140    private static native void flushWithGCNative(long dict, String filePath);
141    private static native void closeNative(long dict);
142    private static native int getFormatVersionNative(long dict);
143    private static native int getProbabilityNative(long dict, int[] word);
144    private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1);
145    private static native void getUnigramPropertyNative(long dict, int[] word,
146            int[] outCodePoints, boolean[] outFlags, int[] outProbability,
147            int[] outHistoricalInfo, ArrayList<int[]> outShortcutTargets,
148            ArrayList<Integer> outShortcutProbabilities);
149    private static native int getSuggestionsNative(long dict, long proximityInfo,
150            long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
151            int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint,
152            int[] suggestOptions, int[] prevWordCodePointArray,
153            int[] outputCodePoints, int[] outputScores, int[] outputIndices, int[] outputTypes,
154            int[] outputAutoCommitFirstWordConfidence);
155    private static native float calcNormalizedScoreNative(int[] before, int[] after, int score);
156    private static native int editDistanceNative(int[] before, int[] after);
157    private static native void addUnigramWordNative(long dict, int[] word, int probability,
158            int[] shortcutTarget, int shortcutProbability, boolean isNotAWord,
159            boolean isBlacklisted, int timestamp);
160    private static native void addBigramWordsNative(long dict, int[] word0, int[] word1,
161            int probability, int timestamp);
162    private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
163    private static native int addMultipleDictionaryEntriesNative(long dict,
164            LanguageModelParam[] languageModelParams, int startIndex);
165    private static native int calculateProbabilityNative(long dict, int unigramProbability,
166            int bigramProbability);
167    private static native String getPropertyNative(long dict, String query);
168
169    @UsedForTesting
170    public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
171            final Map<String, String> attributeMap) {
172        final String[] keyArray = new String[attributeMap.size()];
173        final String[] valueArray = new String[attributeMap.size()];
174        int index = 0;
175        for (final String key : attributeMap.keySet()) {
176            keyArray[index] = key;
177            valueArray[index] = attributeMap.get(key);
178            index++;
179        }
180        return createEmptyDictFileNative(filePath, dictVersion, keyArray, valueArray);
181    }
182
183    // TODO: Move native dict into session
184    private final void loadDictionary(final String path, final long startOffset,
185            final long length, final boolean isUpdatable) {
186        mNativeDict = openNative(path, startOffset, length, isUpdatable);
187    }
188
189    @Override
190    public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer,
191            final String prevWord, final ProximityInfo proximityInfo,
192            final boolean blockOffensiveWords, final int[] additionalFeaturesOptions) {
193        return getSuggestionsWithSessionId(composer, prevWord, proximityInfo, blockOffensiveWords,
194                additionalFeaturesOptions, 0 /* sessionId */);
195    }
196
197    @Override
198    public ArrayList<SuggestedWordInfo> getSuggestionsWithSessionId(final WordComposer composer,
199            final String prevWord, final ProximityInfo proximityInfo,
200            final boolean blockOffensiveWords, final int[] additionalFeaturesOptions,
201            final int sessionId) {
202        if (!isValidDictionary()) return null;
203
204        Arrays.fill(mInputCodePoints, Constants.NOT_A_CODE);
205        // TODO: toLowerCase in the native code
206        final int[] prevWordCodePointArray = (null == prevWord)
207                ? null : StringUtils.toCodePointArray(prevWord);
208        final int composerSize = composer.size();
209
210        final boolean isGesture = composer.isBatchMode();
211        if (composerSize <= 1 || !isGesture) {
212            if (composerSize > MAX_WORD_LENGTH - 1) return null;
213            for (int i = 0; i < composerSize; i++) {
214                mInputCodePoints[i] = composer.getCodeAt(i);
215            }
216        }
217
218        final InputPointers ips = composer.getInputPointers();
219        final int inputSize = isGesture ? ips.getPointerSize() : composerSize;
220        mNativeSuggestOptions.setIsGesture(isGesture);
221        mNativeSuggestOptions.setAdditionalFeaturesOptions(additionalFeaturesOptions);
222        // proximityInfo and/or prevWordForBigrams may not be null.
223        final int count = getSuggestionsNative(mNativeDict, proximityInfo.getNativeProximityInfo(),
224                getTraverseSession(sessionId).getSession(), ips.getXCoordinates(),
225                ips.getYCoordinates(), ips.getTimes(), ips.getPointerIds(), mInputCodePoints,
226                inputSize, 0 /* commitPoint */, mNativeSuggestOptions.getOptions(),
227                prevWordCodePointArray, mOutputCodePoints, mOutputScores, mSpaceIndices,
228                mOutputTypes, mOutputAutoCommitFirstWordConfidence);
229        final ArrayList<SuggestedWordInfo> suggestions = CollectionUtils.newArrayList();
230        for (int j = 0; j < count; ++j) {
231            final int start = j * MAX_WORD_LENGTH;
232            int len = 0;
233            while (len < MAX_WORD_LENGTH && mOutputCodePoints[start + len] != 0) {
234                ++len;
235            }
236            if (len > 0) {
237                final int flags = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_FLAGS;
238                if (blockOffensiveWords
239                        && 0 != (flags & SuggestedWordInfo.KIND_FLAG_POSSIBLY_OFFENSIVE)
240                        && 0 == (flags & SuggestedWordInfo.KIND_FLAG_EXACT_MATCH)) {
241                    // If we block potentially offensive words, and if the word is possibly
242                    // offensive, then we don't output it unless it's also an exact match.
243                    continue;
244                }
245                final int kind = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_KIND;
246                final int score = SuggestedWordInfo.KIND_WHITELIST == kind
247                        ? SuggestedWordInfo.MAX_SCORE : mOutputScores[j];
248                // TODO: check that all users of the `kind' parameter are ready to accept
249                // flags too and pass mOutputTypes[j] instead of kind
250                suggestions.add(new SuggestedWordInfo(new String(mOutputCodePoints, start, len),
251                        score, kind, this /* sourceDict */,
252                        mSpaceIndices[j] /* indexOfTouchPointOfSecondWord */,
253                        mOutputAutoCommitFirstWordConfidence[0]));
254            }
255        }
256        return suggestions;
257    }
258
259    public boolean isValidDictionary() {
260        return mNativeDict != 0;
261    }
262
263    public int getFormatVersion() {
264        return getFormatVersionNative(mNativeDict);
265    }
266
267    public static float calcNormalizedScore(final String before, final String after,
268            final int score) {
269        return calcNormalizedScoreNative(StringUtils.toCodePointArray(before),
270                StringUtils.toCodePointArray(after), score);
271    }
272
273    public static int editDistance(final String before, final String after) {
274        if (before == null || after == null) {
275            throw new IllegalArgumentException();
276        }
277        return editDistanceNative(StringUtils.toCodePointArray(before),
278                StringUtils.toCodePointArray(after));
279    }
280
281    @Override
282    public boolean isValidWord(final String word) {
283        return getFrequency(word) != NOT_A_PROBABILITY;
284    }
285
286    @Override
287    public int getFrequency(final String word) {
288        if (word == null) return NOT_A_PROBABILITY;
289        int[] codePoints = StringUtils.toCodePointArray(word);
290        return getProbabilityNative(mNativeDict, codePoints);
291    }
292
293    // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni
294    // calls when checking for changes in an entire dictionary.
295    public boolean isValidBigram(final String word0, final String word1) {
296        return getBigramProbability(word0, word1) != NOT_A_PROBABILITY;
297    }
298
299    public int getBigramProbability(final String word0, final String word1) {
300        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) return NOT_A_PROBABILITY;
301        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
302        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
303        return getBigramProbabilityNative(mNativeDict, codePoints0, codePoints1);
304    }
305
306    @UsedForTesting
307    public UnigramProperty getUnigramProperty(final String word) {
308        if (TextUtils.isEmpty(word)) {
309            return null;
310        }
311        final int[] codePoints = StringUtils.toCodePointArray(word);
312        final int[] outCodePoints = new int[MAX_WORD_LENGTH];
313        final boolean[] outFlags = new boolean[FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT];
314        final int[] outProbability = new int[1];
315        final int[] outHistoricalInfo =
316                new int[FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT];
317        final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList();
318        final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList();
319        getUnigramPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbability,
320                outHistoricalInfo, outShortcutTargets, outShortcutProbabilities);
321        return new UnigramProperty(codePoints,
322                outFlags[FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX],
323                outFlags[FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX],
324                outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX],
325                outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX], outProbability[0],
326                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX],
327                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX],
328                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX],
329                outShortcutTargets, outShortcutProbabilities);
330    }
331
332    // Add a unigram entry to binary dictionary with unigram attributes in native code.
333    public void addUnigramWord(final String word, final int probability,
334            final String shortcutTarget, final int shortcutProbability, final boolean isNotAWord,
335            final boolean isBlacklisted, final int timestamp) {
336        if (TextUtils.isEmpty(word)) {
337            return;
338        }
339        final int[] codePoints = StringUtils.toCodePointArray(word);
340        final int[] shortcutTargetCodePoints = (shortcutTarget != null) ?
341                StringUtils.toCodePointArray(shortcutTarget) : null;
342        addUnigramWordNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints,
343                shortcutProbability, isNotAWord, isBlacklisted, timestamp);
344    }
345
346    // Add a bigram entry to binary dictionary with timestamp in native code.
347    public void addBigramWords(final String word0, final String word1, final int probability,
348            final int timestamp) {
349        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) {
350            return;
351        }
352        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
353        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
354        addBigramWordsNative(mNativeDict, codePoints0, codePoints1, probability, timestamp);
355    }
356
357    // Remove a bigram entry form binary dictionary in native code.
358    public void removeBigramWords(final String word0, final String word1) {
359        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) {
360            return;
361        }
362        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
363        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
364        removeBigramWordsNative(mNativeDict, codePoints0, codePoints1);
365    }
366
367    public static class LanguageModelParam {
368        public final int[] mWord0;
369        public final int[] mWord1;
370        public final int[] mShortcutTarget;
371        public final int mUnigramProbability;
372        public final int mBigramProbability;
373        public final int mShortcutProbability;
374        public final boolean mIsNotAWord;
375        public final boolean mIsBlacklisted;
376        public final int mTimestamp;
377
378        // Constructor for unigram.
379        public LanguageModelParam(final String word, final int unigramProbability,
380                final int timestamp) {
381            mWord0 = null;
382            mWord1 = StringUtils.toCodePointArray(word);
383            mShortcutTarget = null;
384            mUnigramProbability = unigramProbability;
385            mBigramProbability = NOT_A_PROBABILITY;
386            mShortcutProbability = NOT_A_PROBABILITY;
387            mIsNotAWord = false;
388            mIsBlacklisted = false;
389            mTimestamp = timestamp;
390        }
391
392        // Constructor for unigram and bigram.
393        public LanguageModelParam(final String word0, final String word1,
394                final int unigramProbability, final int bigramProbability,
395                final int timestamp) {
396            mWord0 = StringUtils.toCodePointArray(word0);
397            mWord1 = StringUtils.toCodePointArray(word1);
398            mShortcutTarget = null;
399            mUnigramProbability = unigramProbability;
400            mBigramProbability = bigramProbability;
401            mShortcutProbability = NOT_A_PROBABILITY;
402            mIsNotAWord = false;
403            mIsBlacklisted = false;
404            mTimestamp = timestamp;
405        }
406    }
407
408    public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) {
409        if (!isValidDictionary()) return;
410        int processedParamCount = 0;
411        while (processedParamCount < languageModelParams.length) {
412            if (needsToRunGC(true /* mindsBlockByGC */)) {
413                flushWithGC();
414            }
415            processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict,
416                    languageModelParams, processedParamCount);
417            if (processedParamCount <= 0) {
418                return;
419            }
420        }
421
422    }
423
424    private void reopen() {
425        close();
426        final File dictFile = new File(mDictFilePath);
427        // WARNING: Because we pass 0 as the offset and file.length() as the length, this can
428        // only be called for actual files. Right now it's only called by the flush() family of
429        // functions, which require an updatable dictionary, so it's okay. But beware.
430        loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */,
431                dictFile.length(), true /* isUpdatable */);
432    }
433
434    public void flush() {
435        if (!isValidDictionary()) return;
436        flushNative(mNativeDict, mDictFilePath);
437        reopen();
438    }
439
440    public void flushWithGC() {
441        if (!isValidDictionary()) return;
442        flushWithGCNative(mNativeDict, mDictFilePath);
443        reopen();
444    }
445
446    /**
447     * Checks whether GC is needed to run or not.
448     * @param mindsBlockByGC Whether to mind operations blocked by GC. We don't need to care about
449     * the blocking in some situations such as in idle time or just before closing.
450     * @return whether GC is needed to run or not.
451     */
452    public boolean needsToRunGC(final boolean mindsBlockByGC) {
453        if (!isValidDictionary()) return false;
454        return needsToRunGCNative(mNativeDict, mindsBlockByGC);
455    }
456
457    @UsedForTesting
458    public int calculateProbability(final int unigramProbability, final int bigramProbability) {
459        if (!isValidDictionary()) return NOT_A_PROBABILITY;
460        return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
461    }
462
463    @UsedForTesting
464    public String getPropertyForTests(String query) {
465        if (!isValidDictionary()) return "";
466        return getPropertyNative(mNativeDict, query);
467    }
468
469    @Override
470    public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
471        return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT;
472    }
473
474    @Override
475    public void close() {
476        synchronized (mDicTraverseSessions) {
477            final int sessionsSize = mDicTraverseSessions.size();
478            for (int index = 0; index < sessionsSize; ++index) {
479                final DicTraverseSession traverseSession = mDicTraverseSessions.valueAt(index);
480                if (traverseSession != null) {
481                    traverseSession.close();
482                }
483            }
484            mDicTraverseSessions.clear();
485        }
486        closeInternalLocked();
487    }
488
489    private synchronized void closeInternalLocked() {
490        if (mNativeDict != 0) {
491            closeNative(mNativeDict);
492            mNativeDict = 0;
493        }
494    }
495
496    // TODO: Manage BinaryDictionary instances without using WeakReference or something.
497    @Override
498    protected void finalize() throws Throwable {
499        try {
500            closeInternalLocked();
501        } finally {
502            super.finalize();
503        }
504    }
505}
506