NgramContext.java revision a568e0acb4a45707e554f63aede917bfa46b9dba
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.android.inputmethod.latin;
18
19import android.text.TextUtils;
20
21import com.android.inputmethod.annotations.UsedForTesting;
22import com.android.inputmethod.latin.common.StringUtils;
23import com.android.inputmethod.latin.define.DecoderSpecificConstants;
24
25import java.util.ArrayList;
26import java.util.Arrays;
27
28import javax.annotation.Nonnull;
29
30/**
31 * Class to represent information of previous words. This class is used to add n-gram entries
32 * into binary dictionaries, to get predictions, and to get suggestions.
33 */
34public class NgramContext {
35    @Nonnull
36    public static final NgramContext EMPTY_PREV_WORDS_INFO =
37            new NgramContext(WordInfo.EMPTY_WORD_INFO);
38    @Nonnull
39    public static final NgramContext BEGINNING_OF_SENTENCE =
40            new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO);
41
42    public static final String BEGINNING_OF_SENTENCE_TAG = "<S>";
43
44    public static final String CONTEXT_SEPARATOR = " ";
45
46    /**
47     * Word information used to represent previous words information.
48     */
49    public static class WordInfo {
50        @Nonnull
51        public static final WordInfo EMPTY_WORD_INFO = new WordInfo(null);
52        @Nonnull
53        public static final WordInfo BEGINNING_OF_SENTENCE_WORD_INFO = new WordInfo();
54
55        // This is an empty char sequence when mIsBeginningOfSentence is true.
56        public final CharSequence mWord;
57        // TODO: Have sentence separator.
58        // Whether the current context is beginning of sentence or not. This is true when composing
59        // at the beginning of an input field or composing a word after a sentence separator.
60        public final boolean mIsBeginningOfSentence;
61
62        // Beginning of sentence.
63        private WordInfo() {
64            mWord = "";
65            mIsBeginningOfSentence = true;
66        }
67
68        public WordInfo(final CharSequence word) {
69            mWord = word;
70            mIsBeginningOfSentence = false;
71        }
72
73        public boolean isValid() {
74            return mWord != null;
75        }
76
77        @Override
78        public int hashCode() {
79            return Arrays.hashCode(new Object[] { mWord, mIsBeginningOfSentence } );
80        }
81
82        @Override
83        public boolean equals(Object o) {
84            if (this == o) return true;
85            if (!(o instanceof WordInfo)) return false;
86            final WordInfo wordInfo = (WordInfo)o;
87            if (mWord == null || wordInfo.mWord == null) {
88                return mWord == wordInfo.mWord
89                        && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence;
90            }
91            return TextUtils.equals(mWord, wordInfo.mWord)
92                    && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence;
93        }
94    }
95
96    // The words immediately before the considered word. EMPTY_WORD_INFO element means we don't
97    // have any context for that previous word including the "beginning of sentence context" - we
98    // just don't know what to predict using the information. An example of that is after a comma.
99    // For simplicity of implementation, elements may also be EMPTY_WORD_INFO transiently after the
100    // WordComposer was reset and before starting a new composing word, but we should never be
101    // calling getSuggetions* in this situation.
102    private final WordInfo[] mPrevWordsInfo;
103    private final int mPrevWordsCount;
104
105    // Construct from the previous word information.
106    public NgramContext(final WordInfo... prevWordsInfo) {
107        mPrevWordsInfo = prevWordsInfo;
108        mPrevWordsCount = prevWordsInfo.length;
109    }
110
111    /**
112     * Create next prevWordsInfo using current prevWordsInfo.
113     */
114    @Nonnull
115    public NgramContext getNextNgramContext(final WordInfo wordInfo) {
116        final int nextPrevWordCount = Math.min(
117                DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM, mPrevWordsCount + 1);
118        final WordInfo[] prevWordsInfo = new WordInfo[nextPrevWordCount];
119        prevWordsInfo[0] = wordInfo;
120        System.arraycopy(mPrevWordsInfo, 0, prevWordsInfo, 1, nextPrevWordCount - 1);
121        return new NgramContext(prevWordsInfo);
122    }
123
124
125    /**
126     * Extracts the previous words context.
127     *
128     * @return a String with the previous words separated by white space.
129     */
130    public String extractPrevWordsContext() {
131        final ArrayList<String> terms = new ArrayList<>();
132        for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) {
133            if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) {
134                final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i];
135                if (wordInfo.mIsBeginningOfSentence) {
136                    terms.add(BEGINNING_OF_SENTENCE_TAG);
137                } else {
138                    final String term = wordInfo.mWord.toString();
139                    if (!term.isEmpty()) {
140                        terms.add(term);
141                    }
142                }
143            }
144        }
145        return terms.size() == 0 ? BEGINNING_OF_SENTENCE_TAG
146                : TextUtils.join(CONTEXT_SEPARATOR, terms);
147    }
148
149    /**
150     * Extracts the previous words context.
151     *
152     * @return a String array with the previous words.
153     */
154    public String[] extractPrevWordsContextArray() {
155        final ArrayList<String> prevTermList = new ArrayList<>();
156        for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) {
157            if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) {
158                final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i];
159                if (wordInfo.mIsBeginningOfSentence) {
160                    prevTermList.add(BEGINNING_OF_SENTENCE_TAG);
161                } else {
162                    final String term = wordInfo.mWord.toString();
163                    if (!term.isEmpty()) {
164                        prevTermList.add(term);
165                    }
166                }
167            }
168        }
169        final String[] contextStringArray = prevTermList.size() == 0 ?
170                new String[] { BEGINNING_OF_SENTENCE_TAG }
171                : prevTermList.toArray(new String[prevTermList.size()]);
172        return contextStringArray;
173    }
174
175    public boolean isValid() {
176        return mPrevWordsCount > 0 && mPrevWordsInfo[0].isValid();
177    }
178
179    public boolean isBeginningOfSentenceContext() {
180        return mPrevWordsCount > 0 && mPrevWordsInfo[0].mIsBeginningOfSentence;
181    }
182
183    // n is 1-indexed.
184    // TODO: Remove
185    public CharSequence getNthPrevWord(final int n) {
186        if (n <= 0 || n > mPrevWordsCount) {
187            return null;
188        }
189        return mPrevWordsInfo[n - 1].mWord;
190    }
191
192    // n is 1-indexed.
193    @UsedForTesting
194    public boolean isNthPrevWordBeginningOfSentence(final int n) {
195        if (n <= 0 || n > mPrevWordsCount) {
196            return false;
197        }
198        return mPrevWordsInfo[n - 1].mIsBeginningOfSentence;
199    }
200
201    public void outputToArray(final int[][] codePointArrays,
202            final boolean[] isBeginningOfSentenceArray) {
203        for (int i = 0; i < mPrevWordsCount; i++) {
204            final WordInfo wordInfo = mPrevWordsInfo[i];
205            if (wordInfo == null || !wordInfo.isValid()) {
206                codePointArrays[i] = new int[0];
207                isBeginningOfSentenceArray[i] = false;
208                continue;
209            }
210            codePointArrays[i] = StringUtils.toCodePointArray(wordInfo.mWord);
211            isBeginningOfSentenceArray[i] = wordInfo.mIsBeginningOfSentence;
212        }
213    }
214
215    public int getPrevWordCount() {
216        return mPrevWordsCount;
217    }
218
219    @Override
220    public int hashCode() {
221        int hashValue = 0;
222        for (final WordInfo wordInfo : mPrevWordsInfo) {
223            if (wordInfo == null || !WordInfo.EMPTY_WORD_INFO.equals(wordInfo)) {
224                break;
225            }
226            hashValue ^= wordInfo.hashCode();
227        }
228        return hashValue;
229    }
230
231    @Override
232    public boolean equals(Object o) {
233        if (this == o) return true;
234        if (!(o instanceof NgramContext)) return false;
235        final NgramContext prevWordsInfo = (NgramContext)o;
236
237        final int minLength = Math.min(mPrevWordsCount, prevWordsInfo.mPrevWordsCount);
238        for (int i = 0; i < minLength; i++) {
239            if (!mPrevWordsInfo[i].equals(prevWordsInfo.mPrevWordsInfo[i])) {
240                return false;
241            }
242        }
243        final WordInfo[] longerWordsInfo;
244        final int longerWordsInfoCount;
245        if (mPrevWordsCount > prevWordsInfo.mPrevWordsCount) {
246            longerWordsInfo = mPrevWordsInfo;
247            longerWordsInfoCount = mPrevWordsCount;
248        } else {
249            longerWordsInfo = prevWordsInfo.mPrevWordsInfo;
250            longerWordsInfoCount = prevWordsInfo.mPrevWordsCount;
251        }
252        for (int i = minLength; i < longerWordsInfoCount; i++) {
253            if (longerWordsInfo[i] != null
254                    && !WordInfo.EMPTY_WORD_INFO.equals(longerWordsInfo[i])) {
255                return false;
256            }
257        }
258        return true;
259    }
260
261    @Override
262    public String toString() {
263        final StringBuffer builder = new StringBuffer();
264        for (int i = 0; i < mPrevWordsCount; i++) {
265            final WordInfo wordInfo = mPrevWordsInfo[i];
266            builder.append("PrevWord[");
267            builder.append(i);
268            builder.append("]: ");
269            if (wordInfo == null) {
270                builder.append("null. ");
271                continue;
272            }
273            if (!wordInfo.isValid()) {
274                builder.append("Empty. ");
275                continue;
276            }
277            builder.append(wordInfo.mWord);
278            builder.append(", isBeginningOfSentence: ");
279            builder.append(wordInfo.mIsBeginningOfSentence);
280            builder.append(". ");
281        }
282        return builder.toString();
283    }
284}
285