LanguageModelParam.java revision e708b1bc2e11285ad404133b8de21719ce08acb5
1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.utils;
18
19import android.util.Log;
20
21import com.android.inputmethod.latin.Dictionary;
22import com.android.inputmethod.latin.DictionaryFacilitator;
23import com.android.inputmethod.latin.PrevWordsInfo;
24import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
25
26import java.util.ArrayList;
27import java.util.List;
28import java.util.Locale;
29
30// Note: this class is used as a parameter type of a native method. You should be careful when you
31// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative().
32public final class LanguageModelParam {
33    private static final String TAG = LanguageModelParam.class.getSimpleName();
34    private static final boolean DEBUG = false;
35    private static final boolean DEBUG_TOKEN = false;
36
37    // For now, these probability values are being referred to only when we add new entries to
38    // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or
39    // non-0. Thus, it's not meaningful to compare 10, 100, and so on.
40    // TODO: Revise the logic in ForgettingCurveUtils in native code.
41    private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100;
42    private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
43    private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10;
44    private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
45
46    public final String mTargetWord;
47    public final int[] mWord0;
48    public final int[] mWord1;
49    // TODO: this needs to be a list of shortcuts
50    public final int[] mShortcutTarget;
51    public final int mUnigramProbability;
52    public final int mBigramProbability;
53    public final int mShortcutProbability;
54    public final boolean mIsNotAWord;
55    public final boolean mIsBlacklisted;
56    // Time stamp in seconds.
57    public final int mTimestamp;
58
59    // Constructor for unigram. TODO: support shortcuts
60    public LanguageModelParam(final String word, final int unigramProbability,
61            final int timestamp) {
62        this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp);
63    }
64
65    // Constructor for unigram and bigram.
66    public LanguageModelParam(final String word0, final String word1,
67            final int unigramProbability, final int bigramProbability,
68            final int timestamp) {
69        mTargetWord = word1;
70        mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0);
71        mWord1 = StringUtils.toCodePointArray(word1);
72        mShortcutTarget = null;
73        mUnigramProbability = unigramProbability;
74        mBigramProbability = bigramProbability;
75        mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
76        mIsNotAWord = false;
77        mIsBlacklisted = false;
78        mTimestamp = timestamp;
79    }
80
81    // Process a list of words and return a list of {@link LanguageModelParam} objects.
82    public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom(
83            final List<String> tokens, final int timestamp,
84            final DictionaryFacilitator dictionaryFacilitator,
85            final SpacingAndPunctuations spacingAndPunctuations,
86            final DistracterFilter distracterFilter) {
87        final ArrayList<LanguageModelParam> languageModelParams = new ArrayList<>();
88        final int N = tokens.size();
89        PrevWordsInfo prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
90        for (int i = 0; i < N; ++i) {
91            final String tempWord = tokens.get(i);
92            if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) {
93                // just skip this token
94                if (DEBUG_TOKEN) {
95                    Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\"");
96                }
97                continue;
98            }
99            if (!DictionaryInfoUtils.looksValidForDictionaryInsertion(
100                    tempWord, spacingAndPunctuations)) {
101                if (DEBUG_TOKEN) {
102                    Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
103                            + tempWord + "\"");
104                }
105                // Sentence terminator found. Split.
106                prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
107                continue;
108            }
109            if (DEBUG_TOKEN) {
110                Log.d(TAG, "--- word: \"" + tempWord + "\"");
111            }
112            final LanguageModelParam languageModelParam =
113                    detectWhetherVaildWordOrNotAndGetLanguageModelParam(
114                            prevWordsInfo, tempWord, timestamp, dictionaryFacilitator,
115                            distracterFilter);
116            if (languageModelParam == null) {
117                continue;
118            }
119            languageModelParams.add(languageModelParam);
120            prevWordsInfo = prevWordsInfo.getNextPrevWordsInfo(
121                    new PrevWordsInfo.WordInfo(tempWord));
122        }
123        return languageModelParams;
124    }
125
126    private static LanguageModelParam detectWhetherVaildWordOrNotAndGetLanguageModelParam(
127            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
128            final DictionaryFacilitator dictionaryFacilitator,
129            final DistracterFilter distracterFilter) {
130        final Locale locale = dictionaryFacilitator.getLocale();
131        if (locale == null) {
132            return null;
133        }
134        if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) {
135            return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
136                    true /* isValidWord */, locale, distracterFilter);
137        }
138
139        final String lowerCaseTargetWord = targetWord.toLowerCase(locale);
140        if (dictionaryFacilitator.isValidWord(lowerCaseTargetWord, false /* ignoreCase */)) {
141            // Add the lower-cased word.
142            return createAndGetLanguageModelParamOfWord(prevWordsInfo, lowerCaseTargetWord,
143                    timestamp, true /* isValidWord */, locale, distracterFilter);
144        }
145
146        // Treat the word as an OOV word.
147        return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
148                false /* isValidWord */, locale, distracterFilter);
149    }
150
151    private static LanguageModelParam createAndGetLanguageModelParamOfWord(
152            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
153            final boolean isValidWord, final Locale locale,
154            final DistracterFilter distracterFilter) {
155        final String word;
156        if (StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST
157                && !prevWordsInfo.isValid() && !isValidWord) {
158            word = targetWord.toLowerCase(locale);
159        } else {
160            word = targetWord;
161        }
162        // Check whether the word is a distracter to words in the dictionaries.
163        if (distracterFilter.isDistracterToWordsInDictionaries(prevWordsInfo, word, locale)) {
164            if (DEBUG) {
165                Log.d(TAG, "The word (" + word + ") is a distracter. Skip this word.");
166            }
167            return null;
168        }
169        final int unigramProbability = isValidWord ?
170                UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD;
171        if (!prevWordsInfo.isValid()) {
172            if (DEBUG) {
173                Log.d(TAG, "--- add unigram: current("
174                        + (isValidWord ? "Valid" : "OOV") + ") = " + word);
175            }
176            return new LanguageModelParam(word, unigramProbability, timestamp);
177        }
178        if (DEBUG) {
179            Log.d(TAG, "--- add bigram: prev = " + prevWordsInfo + ", current("
180                    + (isValidWord ? "Valid" : "OOV") + ") = " + word);
181        }
182        final int bigramProbability = isValidWord ?
183                BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD;
184        return new LanguageModelParam(prevWordsInfo.mPrevWordsInfo[0].mWord, word,
185                unigramProbability, bigramProbability, timestamp);
186    }
187}
188