1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.utils;
18
19import com.android.inputmethod.latin.NgramContext;
20import com.android.inputmethod.latin.NgramContext.WordInfo;
21import com.android.inputmethod.latin.define.DecoderSpecificConstants;
22import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
23
24import java.util.Arrays;
25import java.util.regex.Pattern;
26
27import javax.annotation.Nonnull;
28
29public final class NgramContextUtils {
30    private NgramContextUtils() {
31        // Intentional empty constructor for utility class.
32    }
33
34    private static final Pattern NEWLINE_REGEX = Pattern.compile("[\\r\\n]+");
35    private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");
36    // Get context information from nth word before the cursor. n = 1 retrieves the words
37    // immediately before the cursor, n = 2 retrieves the words before that, and so on. This splits
38    // on whitespace only.
39    // Also, it won't return words that end in a separator (if the nth word before the cursor
40    // ends in a separator, it returns information representing beginning-of-sentence).
41    // Example (when Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
42    // (n = 1) "abc def|" -> abc, def
43    // (n = 1) "abc def |" -> abc, def
44    // (n = 1) "abc 'def|" -> empty, 'def
45    // (n = 1) "abc def. |" -> beginning-of-sentence
46    // (n = 1) "abc def . |" -> beginning-of-sentence
47    // (n = 2) "abc def|" -> beginning-of-sentence, abc
48    // (n = 2) "abc def |" -> beginning-of-sentence, abc
49    // (n = 2) "abc 'def|" -> empty. The context is different from "abc def", but we cannot
50    // represent this situation using NgramContext. See TODO in the method.
51    // TODO: The next example's result should be "abc, def". This have to be fixed before we
52    // retrieve the prior context of Beginning-of-Sentence.
53    // (n = 2) "abc def. |" -> beginning-of-sentence, abc
54    // (n = 2) "abc def . |" -> abc, def
55    // (n = 2) "abc|" -> beginning-of-sentence
56    // (n = 2) "abc |" -> beginning-of-sentence
57    // (n = 2) "abc. def|" -> beginning-of-sentence
58    @Nonnull
59    public static NgramContext getNgramContextFromNthPreviousWord(final CharSequence prev,
60            final SpacingAndPunctuations spacingAndPunctuations, final int n) {
61        if (prev == null) return NgramContext.EMPTY_PREV_WORDS_INFO;
62        final String[] lines = NEWLINE_REGEX.split(prev);
63        if (lines.length == 0) {
64            return new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO);
65        }
66        final String[] w = SPACE_REGEX.split(lines[lines.length - 1]);
67        final WordInfo[] prevWordsInfo =
68                new WordInfo[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
69        Arrays.fill(prevWordsInfo, WordInfo.EMPTY_WORD_INFO);
70        for (int i = 0; i < prevWordsInfo.length; i++) {
71            final int focusedWordIndex = w.length - n - i;
72            // Referring to the word after the focused word.
73            if ((focusedWordIndex + 1) >= 0 && (focusedWordIndex + 1) < w.length) {
74                final String wordFollowingTheNthPrevWord = w[focusedWordIndex + 1];
75                if (!wordFollowingTheNthPrevWord.isEmpty()) {
76                    final char firstChar = wordFollowingTheNthPrevWord.charAt(0);
77                    if (spacingAndPunctuations.isWordConnector(firstChar)) {
78                        // The word following the focused word is starting with a word connector.
79                        // TODO: Return meaningful context for this case.
80                        break;
81                    }
82                }
83            }
84            // If we can't find (n + i) words, the context is beginning-of-sentence.
85            if (focusedWordIndex < 0) {
86                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
87                break;
88            }
89
90            final String focusedWord = w[focusedWordIndex];
91            // If the word is empty, the context is beginning-of-sentence.
92            final int length = focusedWord.length();
93            if (length <= 0) {
94                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
95                break;
96            }
97            // If the word ends in a sentence terminator, the context is beginning-of-sentence.
98            final char lastChar = focusedWord.charAt(length - 1);
99            if (spacingAndPunctuations.isSentenceTerminator(lastChar)) {
100                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
101                break;
102            }
103            // If ends in a word separator or connector, the context is unclear.
104            // TODO: Return meaningful context for this case.
105            if (spacingAndPunctuations.isWordSeparator(lastChar)
106                    || spacingAndPunctuations.isWordConnector(lastChar)) {
107                break;
108            }
109            prevWordsInfo[i] = new WordInfo(focusedWord);
110        }
111        return new NgramContext(prevWordsInfo);
112    }
113}
114