NgramContext.java revision a568e0acb4a45707e554f63aede917bfa46b9dba
1/* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin; 18 19import android.text.TextUtils; 20 21import com.android.inputmethod.annotations.UsedForTesting; 22import com.android.inputmethod.latin.common.StringUtils; 23import com.android.inputmethod.latin.define.DecoderSpecificConstants; 24 25import java.util.ArrayList; 26import java.util.Arrays; 27 28import javax.annotation.Nonnull; 29 30/** 31 * Class to represent information of previous words. This class is used to add n-gram entries 32 * into binary dictionaries, to get predictions, and to get suggestions. 33 */ 34public class NgramContext { 35 @Nonnull 36 public static final NgramContext EMPTY_PREV_WORDS_INFO = 37 new NgramContext(WordInfo.EMPTY_WORD_INFO); 38 @Nonnull 39 public static final NgramContext BEGINNING_OF_SENTENCE = 40 new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO); 41 42 public static final String BEGINNING_OF_SENTENCE_TAG = "<S>"; 43 44 public static final String CONTEXT_SEPARATOR = " "; 45 46 /** 47 * Word information used to represent previous words information. 48 */ 49 public static class WordInfo { 50 @Nonnull 51 public static final WordInfo EMPTY_WORD_INFO = new WordInfo(null); 52 @Nonnull 53 public static final WordInfo BEGINNING_OF_SENTENCE_WORD_INFO = new WordInfo(); 54 55 // This is an empty char sequence when mIsBeginningOfSentence is true. 56 public final CharSequence mWord; 57 // TODO: Have sentence separator. 58 // Whether the current context is beginning of sentence or not. This is true when composing 59 // at the beginning of an input field or composing a word after a sentence separator. 60 public final boolean mIsBeginningOfSentence; 61 62 // Beginning of sentence. 63 private WordInfo() { 64 mWord = ""; 65 mIsBeginningOfSentence = true; 66 } 67 68 public WordInfo(final CharSequence word) { 69 mWord = word; 70 mIsBeginningOfSentence = false; 71 } 72 73 public boolean isValid() { 74 return mWord != null; 75 } 76 77 @Override 78 public int hashCode() { 79 return Arrays.hashCode(new Object[] { mWord, mIsBeginningOfSentence } ); 80 } 81 82 @Override 83 public boolean equals(Object o) { 84 if (this == o) return true; 85 if (!(o instanceof WordInfo)) return false; 86 final WordInfo wordInfo = (WordInfo)o; 87 if (mWord == null || wordInfo.mWord == null) { 88 return mWord == wordInfo.mWord 89 && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence; 90 } 91 return TextUtils.equals(mWord, wordInfo.mWord) 92 && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence; 93 } 94 } 95 96 // The words immediately before the considered word. EMPTY_WORD_INFO element means we don't 97 // have any context for that previous word including the "beginning of sentence context" - we 98 // just don't know what to predict using the information. An example of that is after a comma. 99 // For simplicity of implementation, elements may also be EMPTY_WORD_INFO transiently after the 100 // WordComposer was reset and before starting a new composing word, but we should never be 101 // calling getSuggetions* in this situation. 102 private final WordInfo[] mPrevWordsInfo; 103 private final int mPrevWordsCount; 104 105 // Construct from the previous word information. 106 public NgramContext(final WordInfo... prevWordsInfo) { 107 mPrevWordsInfo = prevWordsInfo; 108 mPrevWordsCount = prevWordsInfo.length; 109 } 110 111 /** 112 * Create next prevWordsInfo using current prevWordsInfo. 113 */ 114 @Nonnull 115 public NgramContext getNextNgramContext(final WordInfo wordInfo) { 116 final int nextPrevWordCount = Math.min( 117 DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM, mPrevWordsCount + 1); 118 final WordInfo[] prevWordsInfo = new WordInfo[nextPrevWordCount]; 119 prevWordsInfo[0] = wordInfo; 120 System.arraycopy(mPrevWordsInfo, 0, prevWordsInfo, 1, nextPrevWordCount - 1); 121 return new NgramContext(prevWordsInfo); 122 } 123 124 125 /** 126 * Extracts the previous words context. 127 * 128 * @return a String with the previous words separated by white space. 129 */ 130 public String extractPrevWordsContext() { 131 final ArrayList<String> terms = new ArrayList<>(); 132 for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) { 133 if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) { 134 final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i]; 135 if (wordInfo.mIsBeginningOfSentence) { 136 terms.add(BEGINNING_OF_SENTENCE_TAG); 137 } else { 138 final String term = wordInfo.mWord.toString(); 139 if (!term.isEmpty()) { 140 terms.add(term); 141 } 142 } 143 } 144 } 145 return terms.size() == 0 ? BEGINNING_OF_SENTENCE_TAG 146 : TextUtils.join(CONTEXT_SEPARATOR, terms); 147 } 148 149 /** 150 * Extracts the previous words context. 151 * 152 * @return a String array with the previous words. 153 */ 154 public String[] extractPrevWordsContextArray() { 155 final ArrayList<String> prevTermList = new ArrayList<>(); 156 for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) { 157 if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) { 158 final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i]; 159 if (wordInfo.mIsBeginningOfSentence) { 160 prevTermList.add(BEGINNING_OF_SENTENCE_TAG); 161 } else { 162 final String term = wordInfo.mWord.toString(); 163 if (!term.isEmpty()) { 164 prevTermList.add(term); 165 } 166 } 167 } 168 } 169 final String[] contextStringArray = prevTermList.size() == 0 ? 170 new String[] { BEGINNING_OF_SENTENCE_TAG } 171 : prevTermList.toArray(new String[prevTermList.size()]); 172 return contextStringArray; 173 } 174 175 public boolean isValid() { 176 return mPrevWordsCount > 0 && mPrevWordsInfo[0].isValid(); 177 } 178 179 public boolean isBeginningOfSentenceContext() { 180 return mPrevWordsCount > 0 && mPrevWordsInfo[0].mIsBeginningOfSentence; 181 } 182 183 // n is 1-indexed. 184 // TODO: Remove 185 public CharSequence getNthPrevWord(final int n) { 186 if (n <= 0 || n > mPrevWordsCount) { 187 return null; 188 } 189 return mPrevWordsInfo[n - 1].mWord; 190 } 191 192 // n is 1-indexed. 193 @UsedForTesting 194 public boolean isNthPrevWordBeginningOfSentence(final int n) { 195 if (n <= 0 || n > mPrevWordsCount) { 196 return false; 197 } 198 return mPrevWordsInfo[n - 1].mIsBeginningOfSentence; 199 } 200 201 public void outputToArray(final int[][] codePointArrays, 202 final boolean[] isBeginningOfSentenceArray) { 203 for (int i = 0; i < mPrevWordsCount; i++) { 204 final WordInfo wordInfo = mPrevWordsInfo[i]; 205 if (wordInfo == null || !wordInfo.isValid()) { 206 codePointArrays[i] = new int[0]; 207 isBeginningOfSentenceArray[i] = false; 208 continue; 209 } 210 codePointArrays[i] = StringUtils.toCodePointArray(wordInfo.mWord); 211 isBeginningOfSentenceArray[i] = wordInfo.mIsBeginningOfSentence; 212 } 213 } 214 215 public int getPrevWordCount() { 216 return mPrevWordsCount; 217 } 218 219 @Override 220 public int hashCode() { 221 int hashValue = 0; 222 for (final WordInfo wordInfo : mPrevWordsInfo) { 223 if (wordInfo == null || !WordInfo.EMPTY_WORD_INFO.equals(wordInfo)) { 224 break; 225 } 226 hashValue ^= wordInfo.hashCode(); 227 } 228 return hashValue; 229 } 230 231 @Override 232 public boolean equals(Object o) { 233 if (this == o) return true; 234 if (!(o instanceof NgramContext)) return false; 235 final NgramContext prevWordsInfo = (NgramContext)o; 236 237 final int minLength = Math.min(mPrevWordsCount, prevWordsInfo.mPrevWordsCount); 238 for (int i = 0; i < minLength; i++) { 239 if (!mPrevWordsInfo[i].equals(prevWordsInfo.mPrevWordsInfo[i])) { 240 return false; 241 } 242 } 243 final WordInfo[] longerWordsInfo; 244 final int longerWordsInfoCount; 245 if (mPrevWordsCount > prevWordsInfo.mPrevWordsCount) { 246 longerWordsInfo = mPrevWordsInfo; 247 longerWordsInfoCount = mPrevWordsCount; 248 } else { 249 longerWordsInfo = prevWordsInfo.mPrevWordsInfo; 250 longerWordsInfoCount = prevWordsInfo.mPrevWordsCount; 251 } 252 for (int i = minLength; i < longerWordsInfoCount; i++) { 253 if (longerWordsInfo[i] != null 254 && !WordInfo.EMPTY_WORD_INFO.equals(longerWordsInfo[i])) { 255 return false; 256 } 257 } 258 return true; 259 } 260 261 @Override 262 public String toString() { 263 final StringBuffer builder = new StringBuffer(); 264 for (int i = 0; i < mPrevWordsCount; i++) { 265 final WordInfo wordInfo = mPrevWordsInfo[i]; 266 builder.append("PrevWord["); 267 builder.append(i); 268 builder.append("]: "); 269 if (wordInfo == null) { 270 builder.append("null. "); 271 continue; 272 } 273 if (!wordInfo.isValid()) { 274 builder.append("Empty. "); 275 continue; 276 } 277 builder.append(wordInfo.mWord); 278 builder.append(", isBeginningOfSentence: "); 279 builder.append(wordInfo.mIsBeginningOfSentence); 280 builder.append(". "); 281 } 282 return builder.toString(); 283 } 284} 285