// BinaryDictionary.java revision 38f341a2a53a04ce4195a0cb99fcb6e71203dec0
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.android.inputmethod.latin;
18
19import android.text.TextUtils;
20import android.util.SparseArray;
21
22import com.android.inputmethod.annotations.UsedForTesting;
23import com.android.inputmethod.keyboard.ProximityInfo;
24import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
25import com.android.inputmethod.latin.makedict.Word;
26import com.android.inputmethod.latin.settings.NativeSuggestOptions;
27import com.android.inputmethod.latin.utils.CollectionUtils;
28import com.android.inputmethod.latin.utils.JniUtils;
29import com.android.inputmethod.latin.utils.LanguageModelParam;
30import com.android.inputmethod.latin.utils.StringUtils;
31import com.android.inputmethod.latin.utils.WordProperty;
32
33import java.io.File;
34import java.util.ArrayList;
35import java.util.Arrays;
36import java.util.Locale;
37import java.util.Map;
38
39/**
40 * Implements a static, compacted, binary dictionary of standard words.
41 */
42// TODO: All methods which should be locked need to have a suffix "Locked".
43public final class BinaryDictionary extends Dictionary {
44    private static final String TAG = BinaryDictionary.class.getSimpleName();
45
46    // Must be equal to MAX_WORD_LENGTH in native/jni/src/defines.h
47    private static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
48    // Must be equal to MAX_RESULTS in native/jni/src/defines.h
49    private static final int MAX_RESULTS = 18;
50    // The cutoff returned by native for auto-commit confidence.
51    // Must be equal to CONFIDENCE_TO_AUTO_COMMIT in native/jni/src/defines.h
52    private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000;
53
54    @UsedForTesting
55    public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
56    @UsedForTesting
57    public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
58    @UsedForTesting
59    public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
60    @UsedForTesting
61    public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
62
63    public static final int NOT_A_VALID_TIMESTAMP = -1;
64
65    // Format to get unigram flags from native side via getWordPropertyNative().
66    private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 4;
67    private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
68    private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1;
69    private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2;
70    private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
71
72    // Format to get probability and historical info from native side via getWordPropertyNative().
73    public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4;
74    public static final int FORMAT_WORD_PROPERTY_PROBABILITY_INDEX = 0;
75    public static final int FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX = 1;
76    public static final int FORMAT_WORD_PROPERTY_LEVEL_INDEX = 2;
77    public static final int FORMAT_WORD_PROPERTY_COUNT_INDEX = 3;
78
79    private long mNativeDict;
80    private final Locale mLocale;
81    private final long mDictSize;
82    private final String mDictFilePath;
83    private final int[] mInputCodePoints = new int[MAX_WORD_LENGTH];
84    private final int[] mOutputCodePoints = new int[MAX_WORD_LENGTH * MAX_RESULTS];
85    private final int[] mSpaceIndices = new int[MAX_RESULTS];
86    private final int[] mOutputScores = new int[MAX_RESULTS];
87    private final int[] mOutputTypes = new int[MAX_RESULTS];
88    // Only one result is ever used
89    private final int[] mOutputAutoCommitFirstWordConfidence = new int[1];
90
91    private final NativeSuggestOptions mNativeSuggestOptions = new NativeSuggestOptions();
92
93    private final SparseArray<DicTraverseSession> mDicTraverseSessions =
94            CollectionUtils.newSparseArray();
95
96    // TODO: There should be a way to remove used DicTraverseSession objects from
97    // {@code mDicTraverseSessions}.
98    private DicTraverseSession getTraverseSession(final int traverseSessionId) {
99        synchronized(mDicTraverseSessions) {
100            DicTraverseSession traverseSession = mDicTraverseSessions.get(traverseSessionId);
101            if (traverseSession == null) {
102                traverseSession = mDicTraverseSessions.get(traverseSessionId);
103                if (traverseSession == null) {
104                    traverseSession = new DicTraverseSession(mLocale, mNativeDict, mDictSize);
105                    mDicTraverseSessions.put(traverseSessionId, traverseSession);
106                }
107            }
108            return traverseSession;
109        }
110    }
111
112    /**
113     * Constructor for the binary dictionary. This is supposed to be called from the
114     * dictionary factory.
115     * @param filename the name of the file to read through native code.
116     * @param offset the offset of the dictionary data within the file.
117     * @param length the length of the binary data.
118     * @param useFullEditDistance whether to use the full edit distance in suggestions
119     * @param dictType the dictionary type, as a human-readable string
120     * @param isUpdatable whether to open the dictionary file in writable mode.
121     */
122    public BinaryDictionary(final String filename, final long offset, final long length,
123            final boolean useFullEditDistance, final Locale locale, final String dictType,
124            final boolean isUpdatable) {
125        super(dictType);
126        mLocale = locale;
127        mDictSize = length;
128        mDictFilePath = filename;
129        mNativeSuggestOptions.setUseFullEditDistance(useFullEditDistance);
130        loadDictionary(filename, offset, length, isUpdatable);
131    }
132
133    static {
134        JniUtils.loadNativeLibrary();
135    }
136
137    private static native boolean createEmptyDictFileNative(String filePath, long dictVersion,
138            String[] attributeKeyStringArray, String[] attributeValueStringArray);
139    private static native long openNative(String sourceDir, long dictOffset, long dictSize,
140            boolean isUpdatable);
141    private static native void flushNative(long dict, String filePath);
142    private static native boolean needsToRunGCNative(long dict, boolean mindsBlockByGC);
143    private static native void flushWithGCNative(long dict, String filePath);
144    private static native void closeNative(long dict);
145    private static native int getFormatVersionNative(long dict);
146    private static native int getProbabilityNative(long dict, int[] word);
147    private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1);
148    private static native void getWordPropertyNative(long dict, int[] word,
149            int[] outCodePoints, boolean[] outFlags, int[] outProbabilityInfo,
150            ArrayList<int[]> outBigramTargets, ArrayList<int[]> outBigramProbabilityInfo,
151            ArrayList<int[]> outShortcutTargets, ArrayList<Integer> outShortcutProbabilities);
152    private static native int getNextWordNative(long dict, int token, int[] outCodePoints);
153    private static native int getSuggestionsNative(long dict, long proximityInfo,
154            long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
155            int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint,
156            int[] suggestOptions, int[] prevWordCodePointArray,
157            int[] outputCodePoints, int[] outputScores, int[] outputIndices, int[] outputTypes,
158            int[] outputAutoCommitFirstWordConfidence);
159    private static native float calcNormalizedScoreNative(int[] before, int[] after, int score);
160    private static native int editDistanceNative(int[] before, int[] after);
161    private static native void addUnigramWordNative(long dict, int[] word, int probability,
162            int[] shortcutTarget, int shortcutProbability, boolean isNotAWord,
163            boolean isBlacklisted, int timestamp);
164    private static native void addBigramWordsNative(long dict, int[] word0, int[] word1,
165            int probability, int timestamp);
166    private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
167    private static native int addMultipleDictionaryEntriesNative(long dict,
168            LanguageModelParam[] languageModelParams, int startIndex);
169    private static native int calculateProbabilityNative(long dict, int unigramProbability,
170            int bigramProbability);
171    private static native int setCurrentTimeForTestNative(int currentTime);
172    private static native String getPropertyNative(long dict, String query);
173
174    @UsedForTesting
175    public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
176            final Map<String, String> attributeMap) {
177        final String[] keyArray = new String[attributeMap.size()];
178        final String[] valueArray = new String[attributeMap.size()];
179        int index = 0;
180        for (final String key : attributeMap.keySet()) {
181            keyArray[index] = key;
182            valueArray[index] = attributeMap.get(key);
183            index++;
184        }
185        return createEmptyDictFileNative(filePath, dictVersion, keyArray, valueArray);
186    }
187
188    // TODO: Move native dict into session
189    private final void loadDictionary(final String path, final long startOffset,
190            final long length, final boolean isUpdatable) {
191        mNativeDict = openNative(path, startOffset, length, isUpdatable);
192    }
193
194    @Override
195    public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer,
196            final String prevWord, final ProximityInfo proximityInfo,
197            final boolean blockOffensiveWords, final int[] additionalFeaturesOptions) {
198        return getSuggestionsWithSessionId(composer, prevWord, proximityInfo, blockOffensiveWords,
199                additionalFeaturesOptions, 0 /* sessionId */);
200    }
201
202    @Override
203    public ArrayList<SuggestedWordInfo> getSuggestionsWithSessionId(final WordComposer composer,
204            final String prevWord, final ProximityInfo proximityInfo,
205            final boolean blockOffensiveWords, final int[] additionalFeaturesOptions,
206            final int sessionId) {
207        if (!isValidDictionary()) return null;
208
209        Arrays.fill(mInputCodePoints, Constants.NOT_A_CODE);
210        // TODO: toLowerCase in the native code
211        final int[] prevWordCodePointArray = (null == prevWord)
212                ? null : StringUtils.toCodePointArray(prevWord);
213        final int composerSize = composer.size();
214
215        final boolean isGesture = composer.isBatchMode();
216        if (composerSize <= 1 || !isGesture) {
217            if (composerSize > MAX_WORD_LENGTH - 1) return null;
218            for (int i = 0; i < composerSize; i++) {
219                mInputCodePoints[i] = composer.getCodeAt(i);
220            }
221        }
222
223        final InputPointers ips = composer.getInputPointers();
224        final int inputSize = isGesture ? ips.getPointerSize() : composerSize;
225        mNativeSuggestOptions.setIsGesture(isGesture);
226        mNativeSuggestOptions.setAdditionalFeaturesOptions(additionalFeaturesOptions);
227        // proximityInfo and/or prevWordForBigrams may not be null.
228        final int count = getSuggestionsNative(mNativeDict, proximityInfo.getNativeProximityInfo(),
229                getTraverseSession(sessionId).getSession(), ips.getXCoordinates(),
230                ips.getYCoordinates(), ips.getTimes(), ips.getPointerIds(), mInputCodePoints,
231                inputSize, 0 /* commitPoint */, mNativeSuggestOptions.getOptions(),
232                prevWordCodePointArray, mOutputCodePoints, mOutputScores, mSpaceIndices,
233                mOutputTypes, mOutputAutoCommitFirstWordConfidence);
234        final ArrayList<SuggestedWordInfo> suggestions = CollectionUtils.newArrayList();
235        for (int j = 0; j < count; ++j) {
236            final int start = j * MAX_WORD_LENGTH;
237            int len = 0;
238            while (len < MAX_WORD_LENGTH && mOutputCodePoints[start + len] != 0) {
239                ++len;
240            }
241            if (len > 0) {
242                final int flags = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_FLAGS;
243                if (blockOffensiveWords
244                        && 0 != (flags & SuggestedWordInfo.KIND_FLAG_POSSIBLY_OFFENSIVE)
245                        && 0 == (flags & SuggestedWordInfo.KIND_FLAG_EXACT_MATCH)) {
246                    // If we block potentially offensive words, and if the word is possibly
247                    // offensive, then we don't output it unless it's also an exact match.
248                    continue;
249                }
250                final int kind = mOutputTypes[j] & SuggestedWordInfo.KIND_MASK_KIND;
251                final int score = SuggestedWordInfo.KIND_WHITELIST == kind
252                        ? SuggestedWordInfo.MAX_SCORE : mOutputScores[j];
253                // TODO: check that all users of the `kind' parameter are ready to accept
254                // flags too and pass mOutputTypes[j] instead of kind
255                suggestions.add(new SuggestedWordInfo(new String(mOutputCodePoints, start, len),
256                        score, kind, this /* sourceDict */,
257                        mSpaceIndices[j] /* indexOfTouchPointOfSecondWord */,
258                        mOutputAutoCommitFirstWordConfidence[0]));
259            }
260        }
261        return suggestions;
262    }
263
264    public boolean isValidDictionary() {
265        return mNativeDict != 0;
266    }
267
268    public int getFormatVersion() {
269        return getFormatVersionNative(mNativeDict);
270    }
271
272    public static float calcNormalizedScore(final String before, final String after,
273            final int score) {
274        return calcNormalizedScoreNative(StringUtils.toCodePointArray(before),
275                StringUtils.toCodePointArray(after), score);
276    }
277
278    public static int editDistance(final String before, final String after) {
279        if (before == null || after == null) {
280            throw new IllegalArgumentException();
281        }
282        return editDistanceNative(StringUtils.toCodePointArray(before),
283                StringUtils.toCodePointArray(after));
284    }
285
286    @Override
287    public boolean isValidWord(final String word) {
288        return getFrequency(word) != NOT_A_PROBABILITY;
289    }
290
291    @Override
292    public int getFrequency(final String word) {
293        if (word == null) return NOT_A_PROBABILITY;
294        int[] codePoints = StringUtils.toCodePointArray(word);
295        return getProbabilityNative(mNativeDict, codePoints);
296    }
297
298    // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni
299    // calls when checking for changes in an entire dictionary.
300    public boolean isValidBigram(final String word0, final String word1) {
301        return getBigramProbability(word0, word1) != NOT_A_PROBABILITY;
302    }
303
304    public int getBigramProbability(final String word0, final String word1) {
305        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) return NOT_A_PROBABILITY;
306        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
307        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
308        return getBigramProbabilityNative(mNativeDict, codePoints0, codePoints1);
309    }
310
311    @UsedForTesting
312    public WordProperty getWordProperty(final String word) {
313        if (TextUtils.isEmpty(word)) {
314            return null;
315        }
316        final int[] codePoints = StringUtils.toCodePointArray(word);
317        final int[] outCodePoints = new int[MAX_WORD_LENGTH];
318        final boolean[] outFlags = new boolean[FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT];
319        final int[] outProbabilityInfo =
320                new int[FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT];
321        final ArrayList<int[]> outBigramTargets = CollectionUtils.newArrayList();
322        final ArrayList<int[]> outBigramProbabilityInfo = CollectionUtils.newArrayList();
323        final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList();
324        final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList();
325        getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbabilityInfo,
326                outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
327                outShortcutProbabilities);
328        return new WordProperty(codePoints,
329                outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
330                outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX],
331                outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX],
332                outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbabilityInfo,
333                outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
334                outShortcutProbabilities);
335    }
336
337    public static class GetNextWordPropertyResult {
338        public WordProperty mWordProperty;
339        public int mNextToken;
340
341        public GetNextWordPropertyResult(final WordProperty wordPreperty, final int nextToken) {
342            mWordProperty = wordPreperty;
343            mNextToken = nextToken;
344        }
345    }
346
347    /**
348     * Method to iterate all words in the dictionary for makedict.
349     * If token is 0, this method newly starts iterating the dictionary.
350     */
351    @UsedForTesting
352    public GetNextWordPropertyResult getNextWordProperty(final int token) {
353        final int[] codePoints = new int[MAX_WORD_LENGTH];
354        final int nextToken = getNextWordNative(mNativeDict, token, codePoints);
355        int len = 0;
356        // codePoints is null-terminated if its length is shorter than the array length.
357        while (len < MAX_WORD_LENGTH && codePoints[len] != 0) {
358            ++len;
359        }
360        final String word = new String(mOutputCodePoints, 0, len);
361        return new GetNextWordPropertyResult(getWordProperty(word), nextToken);
362    }
363
364    // Add a unigram entry to binary dictionary with unigram attributes in native code.
365    public void addUnigramWord(final String word, final int probability,
366            final String shortcutTarget, final int shortcutProbability, final boolean isNotAWord,
367            final boolean isBlacklisted, final int timestamp) {
368        if (TextUtils.isEmpty(word)) {
369            return;
370        }
371        final int[] codePoints = StringUtils.toCodePointArray(word);
372        final int[] shortcutTargetCodePoints = (shortcutTarget != null) ?
373                StringUtils.toCodePointArray(shortcutTarget) : null;
374        addUnigramWordNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints,
375                shortcutProbability, isNotAWord, isBlacklisted, timestamp);
376    }
377
378    // Add a bigram entry to binary dictionary with timestamp in native code.
379    public void addBigramWords(final String word0, final String word1, final int probability,
380            final int timestamp) {
381        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) {
382            return;
383        }
384        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
385        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
386        addBigramWordsNative(mNativeDict, codePoints0, codePoints1, probability, timestamp);
387    }
388
389    // Remove a bigram entry form binary dictionary in native code.
390    public void removeBigramWords(final String word0, final String word1) {
391        if (TextUtils.isEmpty(word0) || TextUtils.isEmpty(word1)) {
392            return;
393        }
394        final int[] codePoints0 = StringUtils.toCodePointArray(word0);
395        final int[] codePoints1 = StringUtils.toCodePointArray(word1);
396        removeBigramWordsNative(mNativeDict, codePoints0, codePoints1);
397    }
398
399    public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) {
400        if (!isValidDictionary()) return;
401        int processedParamCount = 0;
402        while (processedParamCount < languageModelParams.length) {
403            if (needsToRunGC(true /* mindsBlockByGC */)) {
404                flushWithGC();
405            }
406            processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict,
407                    languageModelParams, processedParamCount);
408            if (processedParamCount <= 0) {
409                return;
410            }
411        }
412    }
413
414    private void reopen() {
415        close();
416        final File dictFile = new File(mDictFilePath);
417        // WARNING: Because we pass 0 as the offset and file.length() as the length, this can
418        // only be called for actual files. Right now it's only called by the flush() family of
419        // functions, which require an updatable dictionary, so it's okay. But beware.
420        loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */,
421                dictFile.length(), true /* isUpdatable */);
422    }
423
424    public void flush() {
425        if (!isValidDictionary()) return;
426        flushNative(mNativeDict, mDictFilePath);
427        reopen();
428    }
429
430    public void flushWithGC() {
431        if (!isValidDictionary()) return;
432        flushWithGCNative(mNativeDict, mDictFilePath);
433        reopen();
434    }
435
436    /**
437     * Checks whether GC is needed to run or not.
438     * @param mindsBlockByGC Whether to mind operations blocked by GC. We don't need to care about
439     * the blocking in some situations such as in idle time or just before closing.
440     * @return whether GC is needed to run or not.
441     */
442    public boolean needsToRunGC(final boolean mindsBlockByGC) {
443        if (!isValidDictionary()) return false;
444        return needsToRunGCNative(mNativeDict, mindsBlockByGC);
445    }
446
447    @UsedForTesting
448    public int calculateProbability(final int unigramProbability, final int bigramProbability) {
449        if (!isValidDictionary()) return NOT_A_PROBABILITY;
450        return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
451    }
452
453    /**
454     * Control the current time to be used in the native code. If currentTime >= 0, this method sets
455     * the current time and gets into test mode.
456     * In test mode, set timestamp is used as the current time in the native code.
457     * If currentTime < 0, quit the test mode and returns to using time() to get the current time.
458     *
459     * @param currentTime seconds since the unix epoch
460     * @return current time got in the native code.
461     */
462    @UsedForTesting
463    public static int setCurrentTimeForTest(final int currentTime) {
464        return setCurrentTimeForTestNative(currentTime);
465    }
466
467    @UsedForTesting
468    public String getPropertyForTest(final String query) {
469        if (!isValidDictionary()) return "";
470        return getPropertyNative(mNativeDict, query);
471    }
472
473    @Override
474    public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
475        return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT;
476    }
477
478    @Override
479    public void close() {
480        synchronized (mDicTraverseSessions) {
481            final int sessionsSize = mDicTraverseSessions.size();
482            for (int index = 0; index < sessionsSize; ++index) {
483                final DicTraverseSession traverseSession = mDicTraverseSessions.valueAt(index);
484                if (traverseSession != null) {
485                    traverseSession.close();
486                }
487            }
488            mDicTraverseSessions.clear();
489        }
490        closeInternalLocked();
491    }
492
493    private synchronized void closeInternalLocked() {
494        if (mNativeDict != 0) {
495            closeNative(mNativeDict);
496            mNativeDict = 0;
497        }
498    }
499
500    // TODO: Manage BinaryDictionary instances without using WeakReference or something.
501    @Override
502    protected void finalize() throws Throwable {
503        try {
504            closeInternalLocked();
505        } finally {
506            super.finalize();
507        }
508    }
509}
510