// LanguageModelParam.java revision 86f36003fd4397143bd37938dda029e5707634af
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.utils;

import android.util.Log;

import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.DictionaryFacilitator;
import com.android.inputmethod.latin.PrevWordsInfo;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

// Note: this class is used as a parameter type of a native method. You should be careful when you
// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative().
public final class LanguageModelParam {
    private static final String TAG = LanguageModelParam.class.getSimpleName();
    private static final boolean DEBUG = false;
    private static final boolean DEBUG_TOKEN = false;

    // For now, these probability values are being referred to only when we add new entries to
    // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or
    // non-0. Thus, it's not meaningful to compare 10, 100, and so on.
    // TODO: Revise the logic in ForgettingCurveUtils in native code.
    private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100;
    private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
    private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10;
    private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;

    // The word this entry is about (same text as mWord1, kept as a CharSequence).
    public final CharSequence mTargetWord;
    // Code points of the preceding word, or null when this entry is a plain unigram.
    public final int[] mWord0;
    // Code points of the target word.
    public final int[] mWord1;
    // TODO: this needs to be a list of shortcuts
    public final int[] mShortcutTarget;
    public final int mUnigramProbability;
    public final int mBigramProbability;
    public final int mShortcutProbability;
    public final boolean mIsNotAWord;
    public final boolean mIsBlacklisted;
    // Time stamp in seconds.
    public final int mTimestamp;

    /**
     * Constructor for unigram. TODO: support shortcuts
     *
     * @param word the target word.
     * @param unigramProbability the unigram probability to record for {@code word}.
     * @param timestamp the time stamp in seconds.
     */
    public LanguageModelParam(final CharSequence word, final int unigramProbability,
            final int timestamp) {
        this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp);
    }

    /**
     * Constructor for unigram and bigram.
     *
     * Shortcut data is not populated: the shortcut target is {@code null}, the shortcut
     * probability is {@link Dictionary#NOT_A_PROBABILITY}, and both the not-a-word and
     * blacklisted flags are {@code false}.
     *
     * @param word0 the preceding word, or {@code null} when there is none.
     * @param word1 the target word.
     * @param unigramProbability the unigram probability to record for {@code word1}.
     * @param bigramProbability the bigram probability to record for the pair.
     * @param timestamp the time stamp in seconds.
     */
    public LanguageModelParam(final CharSequence word0, final CharSequence word1,
            final int unigramProbability, final int bigramProbability,
            final int timestamp) {
        mTargetWord = word1;
        mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0);
        mWord1 = StringUtils.toCodePointArray(word1);
        mShortcutTarget = null;
        mUnigramProbability = unigramProbability;
        mBigramProbability = bigramProbability;
        mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
        mIsNotAWord = false;
        mIsBlacklisted = false;
        mTimestamp = timestamp;
    }

    /**
     * Process a list of words and return a list of {@link LanguageModelParam} objects.
     *
     * Empty or whitespace-only tokens are skipped. A token that does not look valid for
     * dictionary insertion is skipped as well and resets the previous-word context. Tokens for
     * which no parameter could be created (see
     * {@link #detectWhetherValidWordOrNotAndGetLanguageModelParam}) are skipped without
     * touching the previous-word context.
     *
     * @param tokens the tokens to process, in order.
     * @param timestamp the time stamp in seconds to put on every created entry.
     * @param dictionaryFacilitator used to obtain the locale and to check word validity.
     * @param spacingAndPunctuations used to decide whether a token may be inserted.
     * @param distracterFilter used to reject distracter words.
     * @return the created parameters, in token order; never {@code null}.
     */
    public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom(
            final List<String> tokens, final int timestamp,
            final DictionaryFacilitator dictionaryFacilitator,
            final SpacingAndPunctuations spacingAndPunctuations,
            final DistracterFilter distracterFilter) {
        final ArrayList<LanguageModelParam> languageModelParams = new ArrayList<>();
        final int N = tokens.size();
        PrevWordsInfo prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
        for (int i = 0; i < N; ++i) {
            final String tempWord = tokens.get(i);
            if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) {
                // just skip this token
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\"");
                }
                continue;
            }
            if (!DictionaryInfoUtils.looksValidForDictionaryInsertion(
                    tempWord, spacingAndPunctuations)) {
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
                            + tempWord + "\"");
                }
                // Sentence terminator found. Split.
                prevWordsInfo = PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
                continue;
            }
            if (DEBUG_TOKEN) {
                Log.d(TAG, "--- word: \"" + tempWord + "\"");
            }
            final LanguageModelParam languageModelParam =
                    detectWhetherValidWordOrNotAndGetLanguageModelParam(
                            prevWordsInfo, tempWord, timestamp, dictionaryFacilitator,
                            distracterFilter);
            if (languageModelParam == null) {
                continue;
            }
            languageModelParams.add(languageModelParam);
            // Chain on the word that was actually recorded (it may have been lower-cased)
            // rather than on the raw token, so that word0 of the next bigram matches the
            // unigram entry created for this token.
            prevWordsInfo = prevWordsInfo.getNextPrevWordsInfo(
                    new PrevWordsInfo.WordInfo(languageModelParam.mTargetWord.toString()));
        }
        return languageModelParams;
    }

    /**
     * Decides whether the target word is valid in the dictionaries and builds the matching
     * {@link LanguageModelParam}.
     *
     * The word is looked up as-is first, then lower-cased with the facilitator's locale. If
     * the lower-cased form is the valid one, it is the form that gets recorded. Otherwise the
     * word is handled as out-of-vocabulary.
     *
     * @return the created parameter, or {@code null} when the facilitator has no locale or
     *         the word was rejected as a distracter.
     */
    private static LanguageModelParam detectWhetherValidWordOrNotAndGetLanguageModelParam(
            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
            final DictionaryFacilitator dictionaryFacilitator,
            final DistracterFilter distracterFilter) {
        final Locale locale = dictionaryFacilitator.getLocale();
        if (locale == null) {
            return null;
        }
        if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) {
            return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
                    true /* isValidWord */, locale, distracterFilter);
        }

        final String lowerCaseTargetWord = targetWord.toLowerCase(locale);
        if (dictionaryFacilitator.isValidWord(lowerCaseTargetWord, false /* ignoreCase */)) {
            // Add the lower-cased word.
            return createAndGetLanguageModelParamOfWord(prevWordsInfo, lowerCaseTargetWord,
                    timestamp, true /* isValidWord */, locale, distracterFilter);
        }

        // Treat the word as an OOV word.
        return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
                false /* isValidWord */, locale, distracterFilter);
    }

    /**
     * Builds the {@link LanguageModelParam} for a word whose validity is already known.
     *
     * An out-of-vocabulary word with only its first letter capitalized and no valid previous
     * word is lower-cased before being recorded. A unigram-only parameter is produced when
     * there is no valid previous word; otherwise a unigram-and-bigram parameter is produced
     * using the first previous word as word0.
     *
     * @return the created parameter, or {@code null} when the filter reports the word as a
     *         distracter.
     */
    private static LanguageModelParam createAndGetLanguageModelParamOfWord(
            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
            final boolean isValidWord, final Locale locale,
            final DistracterFilter distracterFilter) {
        final String word;
        if (StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST
                && !prevWordsInfo.isValid() && !isValidWord) {
            word = targetWord.toLowerCase(locale);
        } else {
            word = targetWord;
        }
        // Check whether the word is a distracter to words in the dictionaries.
        if (distracterFilter.isDistracterToWordsInDictionaries(prevWordsInfo, word, locale)) {
            if (DEBUG) {
                Log.d(TAG, "The word (" + word + ") is a distracter. Skip this word.");
            }
            return null;
        }
        final int unigramProbability = isValidWord ?
                UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD;
        if (!prevWordsInfo.isValid()) {
            if (DEBUG) {
                Log.d(TAG, "--- add unigram: current("
                        + (isValidWord ? "Valid" : "OOV") + ") = " + word);
            }
            return new LanguageModelParam(word, unigramProbability, timestamp);
        }
        if (DEBUG) {
            Log.d(TAG, "--- add bigram: prev = " + prevWordsInfo + ", current("
                    + (isValidWord ? "Valid" : "OOV") + ") = " + word);
        }
        final int bigramProbability = isValidWord ?
                BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD;
        return new LanguageModelParam(prevWordsInfo.mPrevWordsInfo[0].mWord, word,
                unigramProbability, bigramProbability, timestamp);
    }
}