// LanguageModelParam.java revision a91561aa58db1c43092c1caecc051a11fa5391c7
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.utils;

import android.util.Log;

import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.DictionaryFacilitator;
import com.android.inputmethod.latin.PrevWordsInfo;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Immutable description of one unigram (and optionally one bigram) entry to be added to a
 * dictionary.
 *
 * Note: this class is used as a parameter type of a native method. You should be careful when you
 * rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative().
 */
public final class LanguageModelParam {
    private static final String TAG = LanguageModelParam.class.getSimpleName();
    private static final boolean DEBUG = false;
    private static final boolean DEBUG_TOKEN = false;

    // For now, these probability values are being referred to only when we add new entries to
    // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or
    // non-0. Thus, it's not meaningful to compare 10, 100, and so on.
    // TODO: Revise the logic in ForgettingCurveUtils in native code.
    private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100;
    private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
    private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10;
    private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;

    /** The word this entry is about; same string that {@link #mWord1} was built from. */
    public final String mTargetWord;
    /** Code points of the preceding word, or {@code null} for a unigram-only entry. */
    public final int[] mWord0;
    /** Code points of the target word. */
    public final int[] mWord1;
    // TODO: this needs to be a list of shortcuts
    public final int[] mShortcutTarget;
    public final int mUnigramProbability;
    public final int mBigramProbability;
    public final int mShortcutProbability;
    public final boolean mIsNotAWord;
    public final boolean mIsBlacklisted;
    // Time stamp in seconds.
    public final int mTimestamp;

    /**
     * Creates a unigram-only entry: there is no preceding word and the bigram probability is
     * {@link Dictionary#NOT_A_PROBABILITY}.
     *
     * TODO: support shortcuts
     *
     * @param word the target word.
     * @param unigramProbability the unigram probability to record for {@code word}.
     * @param timestamp the time stamp in seconds.
     */
    public LanguageModelParam(final String word, final int unigramProbability,
            final int timestamp) {
        this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp);
    }

    /**
     * Creates an entry carrying both unigram and bigram information. Shortcut data is left
     * unset and the entry is flagged neither as "not a word" nor as blacklisted.
     *
     * @param word0 the preceding word, or {@code null} when there is none.
     * @param word1 the target word.
     * @param unigramProbability the unigram probability to record for {@code word1}.
     * @param bigramProbability the bigram probability to record for the pair.
     * @param timestamp the time stamp in seconds.
     */
    public LanguageModelParam(final String word0, final String word1,
            final int unigramProbability, final int bigramProbability,
            final int timestamp) {
        mTargetWord = word1;
        if (word0 == null) {
            mWord0 = null;
        } else {
            mWord0 = StringUtils.toCodePointArray(word0);
        }
        mWord1 = StringUtils.toCodePointArray(word1);
        mShortcutTarget = null;
        mUnigramProbability = unigramProbability;
        mBigramProbability = bigramProbability;
        mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
        mIsNotAWord = false;
        mIsBlacklisted = false;
        mTimestamp = timestamp;
    }

    /**
     * Processes a list of words and returns a list of {@link LanguageModelParam} objects.
     *
     * Tokens are visited in order while tracking the previously accepted word:
     * <ul>
     * <li>empty or whitespace-only tokens are skipped and leave the previous word untouched;</li>
     * <li>tokens that do not look valid for dictionary insertion are skipped and reset the
     * previous word to {@link PrevWordsInfo#EMPTY_PREV_WORDS_INFO};</li>
     * <li>tokens for which no entry could be built are skipped and leave the previous word
     * untouched;</li>
     * <li>every other token yields an entry and becomes the previous word for the next one.</li>
     * </ul>
     *
     * @param tokens the words to process, in order.
     * @param timestamp the time stamp in seconds to put on every created entry.
     * @param dictionaryFacilitator used to look up the locale and word validity.
     * @param spacingAndPunctuations used to decide whether a token may be inserted at all.
     * @param distracterFilter used to reject out-of-vocabulary words that are distracters.
     * @return the created entries, in token order; possibly empty.
     */
    public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom(
            final List<String> tokens, final int timestamp,
            final DictionaryFacilitator dictionaryFacilitator,
            final SpacingAndPunctuations spacingAndPunctuations,
            final DistracterFilter distracterFilter) {
        final ArrayList<LanguageModelParam> result = new ArrayList<>();
        PrevWordsInfo prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
        for (final String token : tokens) {
            if (StringUtils.isEmptyStringOrWhiteSpaces(token)) {
                // Nothing to learn from this token; keep the current context.
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + token + "\"");
                }
                continue;
            }
            final boolean insertable = DictionaryInfoUtils.looksValidForDictionaryInsertion(
                    token, spacingAndPunctuations);
            if (!insertable) {
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
                            + token + "\"");
                }
                // Sentence terminator found. Split.
                prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
                continue;
            }
            if (DEBUG_TOKEN) {
                Log.d(TAG, "--- word: \"" + token + "\"");
            }
            final LanguageModelParam param =
                    detectWhetherValidWordOrNotAndGetLanguageModelParam(
                            prevWordsInfo, token, timestamp, dictionaryFacilitator,
                            distracterFilter);
            if (param != null) {
                result.add(param);
                // The accepted (possibly lower-cased) word is the context for the next token.
                prevWordsInfo = new PrevWordsInfo(param.mTargetWord);
            }
        }
        return result;
    }

    /**
     * Classifies {@code targetWord} as valid or out-of-vocabulary (OOV) and builds the matching
     * entry.
     *
     * The word is looked up as given first, then lower-cased with the facilitator's locale; in
     * the latter case the lower-cased form is the one stored. A word found by neither lookup is
     * treated as OOV and is kept only if the distracter filter does not reject it.
     *
     * @return the entry, or {@code null} when the facilitator has no locale or when the OOV word
     *         is a distracter to words in dictionaries.
     */
    private static LanguageModelParam detectWhetherValidWordOrNotAndGetLanguageModelParam(
            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
            final DictionaryFacilitator dictionaryFacilitator,
            final DistracterFilter distracterFilter) {
        final Locale locale = dictionaryFacilitator.getLocale();
        if (locale == null) {
            return null;
        }
        // TODO: Though targetWord is an IV (in-vocabulary) word, we should still apply
        // distracterFilter in the following code. If targetWord is a distracter,
        // it should be filtered out.
        if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) {
            return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
                    true /* isValidWord */, locale);
        }

        final String lowerCased = targetWord.toLowerCase(locale);
        if (dictionaryFacilitator.isValidWord(lowerCased, false /* ignoreCase */)) {
            // Add the lower-cased word.
            return createAndGetLanguageModelParamOfWord(prevWordsInfo, lowerCased,
                    timestamp, true /* isValidWord */, locale);
        }

        // Treat the word as an OOV word. The following statement checks whether this OOV
        // is a distracter to words in dictionaries. Being a distracter means the OOV word is
        // too close to a common word in dictionaries (e.g., the OOV "mot" is very close to "not").
        // Adding such a word to dictionaries would interfere with entering in-dictionary words.
        // For example, adding "mot" to dictionaries might interfere with entering "not".
        // This kind of OOV should be filtered out.
        final boolean isDistracter = distracterFilter.isDistracterToWordsInDictionaries(
                prevWordsInfo, targetWord, locale);
        if (isDistracter) {
            return null;
        }
        return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
                false /* isValidWord */, locale);
    }

    /**
     * Builds the entry for {@code targetWord}: a unigram when there is no previous word, a
     * unigram plus bigram otherwise.
     *
     * An OOV word that has no previous word and whose capitalization type is
     * {@link StringUtils#CAPITALIZE_FIRST} is stored lower-cased; every other word is stored as
     * given. Valid words get the "valid word" probabilities, OOV words the "OOV" ones.
     */
    private static LanguageModelParam createAndGetLanguageModelParamOfWord(
            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
            final boolean isValidWord, final Locale locale) {
        final String prevWord = prevWordsInfo.mPrevWord;
        final boolean hasNoPrevWord = (prevWord == null);

        final boolean storeLowerCased =
                StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST
                        && hasNoPrevWord && !isValidWord;
        final String word = storeLowerCased ? targetWord.toLowerCase(locale) : targetWord;

        final int unigramProbability;
        final int bigramProbability;
        if (isValidWord) {
            unigramProbability = UNIGRAM_PROBABILITY_FOR_VALID_WORD;
            bigramProbability = BIGRAM_PROBABILITY_FOR_VALID_WORD;
        } else {
            unigramProbability = UNIGRAM_PROBABILITY_FOR_OOV_WORD;
            bigramProbability = BIGRAM_PROBABILITY_FOR_OOV_WORD;
        }

        if (hasNoPrevWord) {
            if (DEBUG) {
                Log.d(TAG, "--- add unigram: current("
                        + (isValidWord ? "Valid" : "OOV") + ") = " + word);
            }
            return new LanguageModelParam(word, unigramProbability, timestamp);
        }

        if (DEBUG) {
            Log.d(TAG, "--- add bigram: prev = " + prevWord + ", current("
                    + (isValidWord ? "Valid" : "OOV") + ") = " + word);
        }
        return new LanguageModelParam(prevWord, word, unigramProbability,
                bigramProbability, timestamp);
    }
}