BinaryDictionary.java revision a245d15da5d295af21ead9a01583c64796a31ad7
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import android.text.TextUtils;
20import android.util.SparseArray;
21
22import com.android.inputmethod.annotations.UsedForTesting;
23import com.android.inputmethod.keyboard.ProximityInfo;
24import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
25import com.android.inputmethod.latin.settings.NativeSuggestOptions;
26import com.android.inputmethod.latin.utils.CollectionUtils;
27import com.android.inputmethod.latin.utils.JniUtils;
28import com.android.inputmethod.latin.utils.StringUtils;
29import com.android.inputmethod.latin.utils.UnigramProperty;
30
31import java.io.File;
32import java.util.ArrayList;
33import java.util.Arrays;
34import java.util.Locale;
35import java.util.Map;
36
37/**
38 * Implements a static, compacted, binary dictionary of standard words.
39 */
40// TODO: All methods which should be locked need to have a suffix "Locked".
41public final class BinaryDictionary extends Dictionary {
    // Simple name of this class; presumably meant as a log tag (not referenced in the
    // visible code).
    private static final String TAG = BinaryDictionary.class.getSimpleName();

    // Must be equal to MAX_WORD_LENGTH in native/jni/src/defines.h
    private static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
    // Must be equal to MAX_RESULTS in native/jni/src/defines.h
    private static final int MAX_RESULTS = 18;
    // The cutoff returned by native for auto-commit confidence.
    // Must be equal to CONFIDENCE_TO_AUTO_COMMIT in native/jni/src/defines.h
    private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000;

    // Query strings exposed for tests; presumably intended as arguments to
    // getPropertyForTests() -- confirm against the test callers.
    @UsedForTesting
    public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
    @UsedForTesting
    public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
    @UsedForTesting
    public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
    @UsedForTesting
    public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";

    // Sentinel timestamp value; presumably passed where no timestamp applies -- confirm
    // against callers.
    public static final int NOT_A_VALID_TIMESTAMP = -1;

    // Format to get unigram flags from native side via getUnigramPropertyNative().
    // The *_INDEX constants are positions in the boolean array of size
    // FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT.
    private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT = 4;
    private static final int FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
    private static final int FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX = 1;
    private static final int FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX = 2;
    private static final int FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX = 3;

    // Format to get unigram historical info from native side via getUnigramPropertyNative().
    // The *_INDEX constants are positions in the int array of size
    // FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT.
    private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT = 3;
    private static final int FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX = 0;
    private static final int FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX = 1;
    private static final int FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX = 2;

    // Handle returned by openNative(). The value 0 means no dictionary is open: see
    // isValidDictionary() and closeInternalLocked().
    private long mNativeDict;
    // Locale given to the constructor; handed to each DicTraverseSession that is created.
    private final Locale mLocale;
    // The length argument given to the constructor.
    private final long mDictSize;
    // The filename argument given to the constructor; reused by the flush methods and reopen().
    private final String mDictFilePath;
    // Scratch buffers reused by every getSuggestionsWithSessionId() call. The visible code does
    // not synchronize access to them.
    // Input code points copied from the composer.
    private final int[] mInputCodePoints = new int[MAX_WORD_LENGTH];
    // Output words; result j is read starting at offset j * MAX_WORD_LENGTH.
    private final int[] mOutputCodePoints = new int[MAX_WORD_LENGTH * MAX_RESULTS];
    // Passed to native as outputIndices; entry j is read back as the index of the touch point
    // of the second word for result j.
    private final int[] mSpaceIndices = new int[MAX_RESULTS];
    // Score of result j.
    private final int[] mOutputScores = new int[MAX_RESULTS];
    // Kind and flag bits of result j.
    private final int[] mOutputTypes = new int[MAX_RESULTS];
    // Only one result is ever used
    private final int[] mOutputAutoCommitFirstWordConfidence = new int[1];

    // Options handed to native code on each suggestion request.
    private final NativeSuggestOptions mNativeSuggestOptions = new NativeSuggestOptions();

    // Traverse sessions keyed by session id. Accesses in this class synchronize on the map
    // itself.
    private final SparseArray<DicTraverseSession> mDicTraverseSessions =
            CollectionUtils.newSparseArray();
92
93    // TODO: There should be a way to remove used DicTraverseSession objects from
94    // {@code mDicTraverseSessions}.
95    private DicTraverseSession getTraverseSession(final int traverseSessionId) {
96        synchronized(mDicTraverseSessions) {
97            DicTraverseSession traverseSession = mDicTraverseSessions.get(traverseSessionId);
98            if (traverseSession == null) {
99                traverseSession = mDicTraverseSessions.get(traverseSessionId);
100                if (traverseSession == null) {
101                    traverseSession = new DicTraverseSession(mLocale, mNativeDict, mDictSize);
102                    mDicTraverseSessions.put(traverseSessionId, traverseSession);
103                }
104            }
105            return traverseSession;
106        }
107    }
108
109    /**
110     * Constructor for the binary dictionary. This is supposed to be called from the
111     * dictionary factory.
112     * @param filename the name of the file to read through native code.
113     * @param offset the offset of the dictionary data within the file.
114     * @param length the length of the binary data.
115     * @param useFullEditDistance whether to use the full edit distance in suggestions
116     * @param dictType the dictionary type, as a human-readable string
117     * @param isUpdatable whether to open the dictionary file in writable mode.
118     */
119    public BinaryDictionary(final String filename, final long offset, final long length,
120            final boolean useFullEditDistance, final Locale locale, final String dictType,
121            final boolean isUpdatable) {
122        super(dictType);
123        mLocale = locale;
124        mDictSize = length;
125        mDictFilePath = filename;
126        mNativeSuggestOptions.setUseFullEditDistance(useFullEditDistance);
127        loadDictionary(filename, offset, length, isUpdatable);
128    }
129
    // Runs once when this class is initialized: asks JniUtils to load the native library
    // that backs the native methods of this class.
    static {
        JniUtils.loadNativeLibrary();
    }
133
    // Native entry points. Wherever a {@code dict} parameter appears, the callers in this
    // class pass {@code mNativeDict}, the handle obtained from openNative().

    // Dictionary file lifecycle: creation, opening, flushing and closing.
    private static native boolean createEmptyDictFileNative(String filePath, long dictVersion,
            String[] attributeKeyStringArray, String[] attributeValueStringArray);
    private static native long openNative(String sourceDir, long dictOffset, long dictSize,
            boolean isUpdatable);
    private static native void flushNative(long dict, String filePath);
    private static native boolean needsToRunGCNative(long dict, boolean mindsBlockByGC);
    private static native void flushWithGCNative(long dict, String filePath);
    private static native void closeNative(long dict);
    // Queries. Words are passed as arrays of code points.
    private static native int getFormatVersionNative(long dict);
    private static native int getProbabilityNative(long dict, int[] word);
    private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1);
    // The out* arguments are filled in by native code; see getUnigramProperty() for the
    // array sizes and index layout used by the Java side.
    private static native void getUnigramPropertyNative(long dict, int[] word,
            int[] outCodePoints, boolean[] outFlags, int[] outProbability,
            int[] outHistoricalInfo, ArrayList<int[]> outShortcutTargets,
            ArrayList<Integer> outShortcutProbabilities);
    // The return value is used by getSuggestionsWithSessionId() as the number of results
    // to read from the output arrays.
    private static native int getSuggestionsNative(long dict, long proximityInfo,
            long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
            int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint,
            int[] suggestOptions, int[] prevWordCodePointArray,
            int[] outputCodePoints, int[] outputScores, int[] outputIndices, int[] outputTypes,
            int[] outputAutoCommitFirstWordConfidence);
    // Helpers that take no dictionary handle.
    private static native float calcNormalizedScoreNative(int[] before, int[] after, int score);
    private static native int editDistanceNative(int[] before, int[] after);
    // Updates to the dictionary content.
    private static native void addUnigramWordNative(long dict, int[] word, int probability,
            int[] shortcutTarget, int shortcutProbability, boolean isNotAWord,
            boolean isBlacklisted, int timestamp);
    private static native void addBigramWordsNative(long dict, int[] word0, int[] word1,
            int probability, int timestamp);
    private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
    // The return value is fed back as startIndex by addMultipleDictionaryEntries(), which
    // stops when it is not positive.
    private static native int addMultipleDictionaryEntriesNative(long dict,
            LanguageModelParam[] languageModelParams, int startIndex);
    private static native int calculateProbabilityNative(long dict, int unigramProbability,
            int bigramProbability);
    private static native String getPropertyNative(long dict, String query);
168
169    @UsedForTesting
170    public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
171            final Map<String, String> attributeMap) {
172        final String[] keyArray = new String[attributeMap.size()];
173        final String[] valueArray = new String[attributeMap.size()];
174        int index = 0;
175        for (final String key : attributeMap.keySet()) {
176            keyArray[index] = key;
177            valueArray[index] = attributeMap.get(key);
178            index++;
179        }
180        return createEmptyDictFileNative(filePath, dictVersion, keyArray, valueArray);
181    }
182
    // TODO: Move native dict into session
    /**
     * Opens the dictionary through native code and stores the returned handle in
     * {@code mNativeDict}.
     *
     * @param path the path handed to native code.
     * @param startOffset the offset of the dictionary data handed to native code.
     * @param length the size of the dictionary data handed to native code.
     * @param isUpdatable whether native code is asked to open the dictionary as updatable.
     */
    private final void loadDictionary(final String path, final long startOffset,
            final long length, final boolean isUpdatable) {
        mNativeDict = openNative(path, startOffset, length, isUpdatable);
    }
188
    /**
     * Returns suggestions using session id 0; delegates to
     * {@link #getSuggestionsWithSessionId}.
     */
    @Override
    public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer,
            final String prevWord, final ProximityInfo proximityInfo,
            final boolean blockOffensiveWords, final int[] additionalFeaturesOptions) {
        return getSuggestionsWithSessionId(composer, prevWord, proximityInfo, blockOffensiveWords,
                additionalFeaturesOptions, 0 /* sessionId */);
    }
196
197    @Override
198    public ArrayList<SuggestedWordInfo> getSuggestionsWithSessionId(final WordComposer composer,
199            final String prevWord, final ProximityInfo proximityInfo,
200            final boolean blockOffensiveWords, final int[] additionalFeaturesOptions,
201            final int sessionId) {
202        if (!isValidDictionary()) return null;
203
204        Arrays.fill(mInputCodePoints, Constants.NOT_A_CODE);
205        // TODO: toLowerCase in the native code
206        final int[] prevWordCodePointArray = (null == prevWord)
207                ? null : StringUtils.toCodePointArray(prevWord);
208        final int composerSize = composer.size();
209
210        final boolean isGesture = composer.isBatchMode();
211        if (composerSize <= 1 || !isGesture) {
212            if (composerSize > MAX_WORD_LENGTH - 1) return null;
213            for (int i = 0; i < composerSize; i++) {
214                mInputCodePoints[i] = composer.getCodeAt(i);
215            }
216        }
217
218        final InputPointers ips = composer.getInputPointers();
219        final int inputSize = isGesture ? ips.getPointerSize() : composerSize;
220        mNativeSuggestOptions.setIsGesture(isGesture);
221        mNativeSuggestOptions.setAdditionalFeaturesOptions(additionalFeaturesOptions);
222        // proximityInfo and/or prevWordForBigrams may not be null.
223        final int count = getSuggestionsNative(mNativeDict, proximityInfo.getNativeProximityInfo(),
224                getTraverseSession(sessionId).getSession(), ips.getXCoordinates(),
225                ips.getYCoordinates(), ips.getTimes(), ips.getPointerIds(), mInputCodePoints,
226                inputSize, 0 /* commitPoint */, mNativeSuggestOptions.getOptions(),
227                prevWordCodePointArray, mOutputCodePoints, mOutputScores, mSpaceIndices,
228                mOutputTypes, mOutputAutoCommitFirstWordConfidence);
229        final ArrayList<SuggestedWordInfo> suggestions = CollectionUtils.newArrayList();
230        for (int j = 0; j < count; ++j) {
231            final int start = j * MAX_WORD_LENGTH;
232            int len = 0;
233            while (len < MAX_WORD_LENGTH && mOutputCodePoints[start + len] != 0) {
234                ++len;
235            }
236            if (len > 0) {
237                final int flags = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_FLAGS;
238                if (blockOffensiveWords
239                        && 0 != (flags & SuggestedWordInfo.KIND_FLAG_POSSIBLY_OFFENSIVE)
240                        && 0 == (flags & SuggestedWordInfo.KIND_FLAG_EXACT_MATCH)) {
241                    // If we block potentially offensive words, and if the word is possibly
242                    // offensive, then we don't output it unless it's also an exact match.
243                    continue;
244                }
245                final int kind = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_KIND;
246                final int score = SuggestedWordInfo.KIND_WHITELIST == kind
247                        ? SuggestedWordInfo.MAX_SCORE : mOutputScores[j];
248                // TODO: check that all users of the `kind' parameter are ready to accept
249                // flags too and pass mOutputTypes[j] instead of kind
250                suggestions.add(new SuggestedWordInfo(new String(mOutputCodePoints, start, len),
251                        score, kind, this /* sourceDict */,
252                        mSpaceIndices[j] /* indexOfTouchPointOfSecondWord */,
253                        mOutputAutoCommitFirstWordConfidence[0]));
254            }
255        }
256        return suggestions;
257    }
258
    /**
     * Tells whether a native dictionary is currently open.
     *
     * @return true if the native handle is non-zero.
     */
    public boolean isValidDictionary() {
        return mNativeDict != 0;
    }
262
    /**
     * Returns the format version reported by native code for the current handle.
     * NOTE(review): unlike several other methods, no isValidDictionary() check is made before
     * calling into native code -- confirm that native code tolerates a 0 handle.
     */
    public int getFormatVersion() {
        return getFormatVersionNative(mNativeDict);
    }
266
267    public static float calcNormalizedScore(final String before, final String after,
268            final int score) {
269        return calcNormalizedScoreNative(StringUtils.toCodePointArray(before),
270                StringUtils.toCodePointArray(after), score);
271    }
272
273    public static int editDistance(final String before, final String after) {
274        if (before == null || after == null) {
275            throw new IllegalArgumentException();
276        }
277        return editDistanceNative(StringUtils.toCodePointArray(before),
278                StringUtils.toCodePointArray(after));
279    }
280
281    @Override
282    public boolean isValidWord(final String word) {
283        return getFrequency(word) != NOT_A_PROBABILITY;
284    }
285
286    @Override
287    public int getFrequency(final String word) {
288        if (word == null) return NOT_A_PROBABILITY;
289        int[] codePoints = StringUtils.toCodePointArray(word);
290        return getProbabilityNative(mNativeDict, codePoints);
291    }
292
293    // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni
294    // calls when checking for changes in an entire dictionary.
295    public boolean isValidBigram(final String word0, final String word1) {
296        return getBigramProbability(word0, word1) != NOT_A_PROBABILITY;
297    }
298
299    public int getBigramProbability(final String word0, final String word1) {
300        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) return NOT_A_PROBABILITY;
301        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
302        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
303        return getBigramProbabilityNative(mNativeDict, codePoints0, codePoints1);
304    }
305
306    @UsedForTesting
307    public UnigramProperty getUnigramProperty(final String word) {
308        if (TextUtils.isEmpty(word)) {
309            return null;
310        }
311        final int[] codePoints = StringUtils.toCodePointArray(word);
312        final int[] outCodePoints = new int[MAX_WORD_LENGTH];
313        final boolean[] outFlags = new boolean[FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT];
314        final int[] outProbability = new int[1];
315        final int[] outHistoricalInfo =
316                new int[FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT];
317        final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList();
318        final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList();
319        getUnigramPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbability,
320                outHistoricalInfo, outShortcutTargets, outShortcutProbabilities);
321        return new UnigramProperty(codePoints,
322                outFlags[FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX],
323                outFlags[FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX],
324                outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX],
325                outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX], outProbability[0],
326                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX],
327                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX],
328                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX],
329                outShortcutTargets, outShortcutProbabilities);
330    }
331
332    // Add a unigram entry to binary dictionary with unigram attributes in native code.
333    public void addUnigramWord(final String word, final int probability,
334            final String shortcutTarget, final int shortcutProbability, final boolean isNotAWord,
335            final boolean isBlacklisted, final int timestamp) {
336        if (TextUtils.isEmpty(word)) {
337            return;
338        }
339        final int[] codePoints = StringUtils.toCodePointArray(word);
340        final int[] shortcutTargetCodePoints = (shortcutTarget != null) ?
341                StringUtils.toCodePointArray(shortcutTarget) : null;
342        addUnigramWordNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints,
343                shortcutProbability, isNotAWord, isBlacklisted, timestamp);
344    }
345
346    // Add a bigram entry to binary dictionary with timestamp in native code.
347    public void addBigramWords(final String word0, final String word1, final int probability,
348            final int timestamp) {
349        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) {
350            return;
351        }
352        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
353        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
354        addBigramWordsNative(mNativeDict, codePoints0, codePoints1, probability, timestamp);
355    }
356
357    // Remove a bigram entry form binary dictionary in native code.
358    public void removeBigramWords(final String word0, final String word1) {
359        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) {
360            return;
361        }
362        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
363        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
364        removeBigramWordsNative(mNativeDict, codePoints0, codePoints1);
365    }
366
367    public static class LanguageModelParam {
368        public final int[] mWord0;
369        public final int[] mWord1;
370        // TODO: this needs to be a list of shortcuts
371        public final int[] mShortcutTarget;
372        public final int mUnigramProbability;
373        public final int mBigramProbability;
374        public final int mShortcutProbability;
375        public final boolean mIsNotAWord;
376        public final boolean mIsBlacklisted;
377        public final int mTimestamp;
378
379        // Constructor for unigram. TODO: support shortcuts
380        public LanguageModelParam(final String word, final int unigramProbability,
381                final int timestamp) {
382            mWord0 = null;
383            mWord1 = StringUtils.toCodePointArray(word);
384            mShortcutTarget = null;
385            mUnigramProbability = unigramProbability;
386            mBigramProbability = NOT_A_PROBABILITY;
387            mShortcutProbability = NOT_A_PROBABILITY;
388            mIsNotAWord = false;
389            mIsBlacklisted = false;
390            mTimestamp = timestamp;
391        }
392
393        // Constructor for unigram and bigram.
394        public LanguageModelParam(final String word0, final String word1,
395                final int unigramProbability, final int bigramProbability,
396                final int timestamp) {
397            mWord0 = StringUtils.toCodePointArray(word0);
398            mWord1 = StringUtils.toCodePointArray(word1);
399            mShortcutTarget = null;
400            mUnigramProbability = unigramProbability;
401            mBigramProbability = bigramProbability;
402            mShortcutProbability = NOT_A_PROBABILITY;
403            mIsNotAWord = false;
404            mIsBlacklisted = false;
405            mTimestamp = timestamp;
406        }
407    }
408
409    public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) {
410        if (!isValidDictionary()) return;
411        int processedParamCount = 0;
412        while (processedParamCount < languageModelParams.length) {
413            if (needsToRunGC(true /* mindsBlockByGC */)) {
414                flushWithGC();
415            }
416            processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict,
417                    languageModelParams, processedParamCount);
418            if (processedParamCount <= 0) {
419                return;
420            }
421        }
422
423    }
424
425    private void reopen() {
426        close();
427        final File dictFile = new File(mDictFilePath);
428        // WARNING: Because we pass 0 as the offset and file.length() as the length, this can
429        // only be called for actual files. Right now it's only called by the flush() family of
430        // functions, which require an updatable dictionary, so it's okay. But beware.
431        loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */,
432                dictFile.length(), true /* isUpdatable */);
433    }
434
435    public void flush() {
436        if (!isValidDictionary()) return;
437        flushNative(mNativeDict, mDictFilePath);
438        reopen();
439    }
440
441    public void flushWithGC() {
442        if (!isValidDictionary()) return;
443        flushWithGCNative(mNativeDict, mDictFilePath);
444        reopen();
445    }
446
447    /**
448     * Checks whether GC is needed to run or not.
449     * @param mindsBlockByGC Whether to mind operations blocked by GC. We don't need to care about
450     * the blocking in some situations such as in idle time or just before closing.
451     * @return whether GC is needed to run or not.
452     */
453    public boolean needsToRunGC(final boolean mindsBlockByGC) {
454        if (!isValidDictionary()) return false;
455        return needsToRunGCNative(mNativeDict, mindsBlockByGC);
456    }
457
458    @UsedForTesting
459    public int calculateProbability(final int unigramProbability, final int bigramProbability) {
460        if (!isValidDictionary()) return NOT_A_PROBABILITY;
461        return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
462    }
463
464    @UsedForTesting
465    public String getPropertyForTests(String query) {
466        if (!isValidDictionary()) return "";
467        return getPropertyNative(mNativeDict, query);
468    }
469
    /**
     * Tells whether the candidate should be auto-committed.
     *
     * @param candidate the suggestion to examine.
     * @return true if the candidate's auto-commit confidence is strictly greater than
     * {@code CONFIDENCE_TO_AUTO_COMMIT}.
     */
    @Override
    public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
        return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT;
    }
474
475    @Override
476    public void close() {
477        synchronized (mDicTraverseSessions) {
478            final int sessionsSize = mDicTraverseSessions.size();
479            for (int index = 0; index < sessionsSize; ++index) {
480                final DicTraverseSession traverseSession = mDicTraverseSessions.valueAt(index);
481                if (traverseSession != null) {
482                    traverseSession.close();
483                }
484            }
485            mDicTraverseSessions.clear();
486        }
487        closeInternalLocked();
488    }
489
490    private synchronized void closeInternalLocked() {
491        if (mNativeDict != 0) {
492            closeNative(mNativeDict);
493            mNativeDict = 0;
494        }
495    }
496
    // TODO: Manage BinaryDictionary instances without using WeakReference or something.
    /**
     * Closes the native dictionary, if it is still open, when this object is finalized.
     * Only the native handle is closed here; the cached traverse sessions are not touched.
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            closeInternalLocked();
        } finally {
            // Always chain to the superclass finalizer, even if closing throws.
            super.finalize();
        }
    }
506}
507