inputmethod/latin/NgramContext.java

/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin;

import android.text.TextUtils;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.common.StringUtils;
import com.android.inputmethod.latin.define.DecoderSpecificConstants;

import java.util.ArrayList;
import java.util.Arrays;

import javax.annotation.Nonnull;

/**
 * Class to represent information of previous words. This class is used to add n-gram entries
 * into binary dictionaries, to get predictions, and to get suggestions.
 */
public class NgramContext {
    @Nonnull
    public static final NgramContext EMPTY_PREV_WORDS_INFO =
            new NgramContext(WordInfo.EMPTY_WORD_INFO);
    @Nonnull
    public static final NgramContext BEGINNING_OF_SENTENCE =
            new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO);

    public static final String BEGINNING_OF_SENTENCE_TAG = "<S>";

    public static final String CONTEXT_SEPARATOR = " ";

    /**
     * Word information used to represent previous words information.
     */
    public static class WordInfo {
        @Nonnull
        public static final WordInfo EMPTY_WORD_INFO = new WordInfo(null);
        @Nonnull
        public static final WordInfo BEGINNING_OF_SENTENCE_WORD_INFO = new WordInfo();

        // This is an empty char sequence when mIsBeginningOfSentence is true.
        public final CharSequence mWord;
        // TODO: Have sentence separator.
        // Whether the current context is beginning of sentence or not. This is true when composing
        // at the beginning of an input field or composing a word after a sentence separator.
        public final boolean mIsBeginningOfSentence;

        // Beginning of sentence.
        private WordInfo() {
            mWord = "";
            mIsBeginningOfSentence = true;
        }

        public WordInfo(final CharSequence word) {
            mWord = word;
            mIsBeginningOfSentence = false;
        }

        public boolean isValid() {
            return mWord != null;
        }

        @Override
        public int hashCode() {
            return Arrays.hashCode(new Object[] { mWord, mIsBeginningOfSentence } );
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (!(o instanceof WordInfo)) return false;
            final WordInfo wordInfo = (WordInfo)o;
            if (mWord == null || wordInfo.mWord == null) {
                return mWord == wordInfo.mWord
                        && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence;
            }
            return TextUtils.equals(mWord, wordInfo.mWord)
                    && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence;
        }
    }

    // The words immediately before the considered word. EMPTY_WORD_INFO element means we don't
    // have any context for that previous word including the "beginning of sentence context" - we
    // just don't know what to predict using the information. An example of that is after a comma.
    // For simplicity of implementation, elements may also be EMPTY_WORD_INFO transiently after the
    // WordComposer was reset and before starting a new composing word, but we should never be
    // calling getSuggetions* in this situation.
    private final WordInfo[] mPrevWordsInfo;
    private final int mPrevWordsCount;

    // Construct from the previous word information.
    public NgramContext(final WordInfo... prevWordsInfo) {
        mPrevWordsInfo = prevWordsInfo;
        mPrevWordsCount = prevWordsInfo.length;
    }

    /**
     * Create next prevWordsInfo using current prevWordsInfo.
     */
    @Nonnull
    public NgramContext getNextNgramContext(final WordInfo wordInfo) {
        final int nextPrevWordCount = Math.min(
                DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM, mPrevWordsCount + 1);
        final WordInfo[] prevWordsInfo = new WordInfo[nextPrevWordCount];
        prevWordsInfo[0] = wordInfo;
        System.arraycopy(mPrevWordsInfo, 0, prevWordsInfo, 1, nextPrevWordCount - 1);
        return new NgramContext(prevWordsInfo);
    }


    /**
     * Extracts the previous words context.
     *
     * @return a String with the previous words separated by white space.
     */
    public String extractPrevWordsContext() {
        final ArrayList<String> terms = new ArrayList<>();
        for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) {
            if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) {
                final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i];
                if (wordInfo.mIsBeginningOfSentence) {
                    terms.add(BEGINNING_OF_SENTENCE_TAG);
                } else {
                    final String term = wordInfo.mWord.toString();
                    if (!term.isEmpty()) {
                        terms.add(term);
                    }
                }
            }
        }
        return terms.size() == 0 ? BEGINNING_OF_SENTENCE_TAG
                : TextUtils.join(CONTEXT_SEPARATOR, terms);
    }

    /**
     * Extracts the previous words context.
     *
     * @return a String array with the previous words.
     */
    public String[] extractPrevWordsContextArray() {
        final ArrayList<String> prevTermList = new ArrayList<>();
        for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) {
            if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) {
                final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i];
                if (wordInfo.mIsBeginningOfSentence) {
                    prevTermList.add(BEGINNING_OF_SENTENCE_TAG);
                } else {
                    final String term = wordInfo.mWord.toString();
                    if (!term.isEmpty()) {
                        prevTermList.add(term);
                    }
                }
            }
        }
        final String[] contextStringArray = prevTermList.size() == 0 ?
                new String[] { BEGINNING_OF_SENTENCE_TAG }
                : prevTermList.toArray(new String[prevTermList.size()]);
        return contextStringArray;
    }

    public boolean isValid() {
        return mPrevWordsCount > 0 && mPrevWordsInfo[0].isValid();
    }

    public boolean isBeginningOfSentenceContext() {
        return mPrevWordsCount > 0 && mPrevWordsInfo[0].mIsBeginningOfSentence;
    }

    // n is 1-indexed.
    // TODO: Remove
    public CharSequence getNthPrevWord(final int n) {
        if (n <= 0 || n > mPrevWordsCount) {
            return null;
        }
        return mPrevWordsInfo[n - 1].mWord;
    }

    // n is 1-indexed.
    @UsedForTesting
    public boolean isNthPrevWordBeginningOfSentence(final int n) {
        if (n <= 0 || n > mPrevWordsCount) {
            return false;
        }
        return mPrevWordsInfo[n - 1].mIsBeginningOfSentence;
    }

    public void outputToArray(final int[][] codePointArrays,
            final boolean[] isBeginningOfSentenceArray) {
        for (int i = 0; i < mPrevWordsCount; i++) {
            final WordInfo wordInfo = mPrevWordsInfo[i];
            if (wordInfo == null || !wordInfo.isValid()) {
                codePointArrays[i] = new int[0];
                isBeginningOfSentenceArray[i] = false;
                continue;
            }
            codePointArrays[i] = StringUtils.toCodePointArray(wordInfo.mWord);
            isBeginningOfSentenceArray[i] = wordInfo.mIsBeginningOfSentence;
        }
    }

    public int getPrevWordCount() {
        return mPrevWordsCount;
    }

    @Override
    public int hashCode() {
        int hashValue = 0;
        for (final WordInfo wordInfo : mPrevWordsInfo) {
            if (wordInfo == null || !WordInfo.EMPTY_WORD_INFO.equals(wordInfo)) {
                break;
            }
            hashValue ^= wordInfo.hashCode();
        }
        return hashValue;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof NgramContext)) return false;
        final NgramContext prevWordsInfo = (NgramContext)o;

        final int minLength = Math.min(mPrevWordsCount, prevWordsInfo.mPrevWordsCount);
        for (int i = 0; i < minLength; i++) {
            if (!mPrevWordsInfo[i].equals(prevWordsInfo.mPrevWordsInfo[i])) {
                return false;
            }
        }
        final WordInfo[] longerWordsInfo;
        final int longerWordsInfoCount;
        if (mPrevWordsCount > prevWordsInfo.mPrevWordsCount) {
            longerWordsInfo = mPrevWordsInfo;
            longerWordsInfoCount = mPrevWordsCount;
        } else {
            longerWordsInfo = prevWordsInfo.mPrevWordsInfo;
            longerWordsInfoCount = prevWordsInfo.mPrevWordsCount;
        }
        for (int i = minLength; i < longerWordsInfoCount; i++) {
            if (longerWordsInfo[i] != null
                    && !WordInfo.EMPTY_WORD_INFO.equals(longerWordsInfo[i])) {
                return false;
            }
        }
        return true;
    }

    @Override
    public String toString() {
        final StringBuffer builder = new StringBuffer();
        for (int i = 0; i < mPrevWordsCount; i++) {
            final WordInfo wordInfo = mPrevWordsInfo[i];
            builder.append("PrevWord[");
            builder.append(i);
            builder.append("]: ");
            if (wordInfo == null) {
                builder.append("null. ");
                continue;
            }
            if (!wordInfo.isValid()) {
                builder.append("Empty. ");
                continue;
            }
            builder.append(wordInfo.mWord);
            builder.append(", isBeginningOfSentence: ");
            builder.append(wordInfo.mIsBeginningOfSentence);
            builder.append(". ");
        }
        return builder.toString();
    }
}