/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi
17bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagipackage com.android.inputmethod.latin.utils;
18bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi
19bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagiimport java.util.regex.Pattern;
20bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi
21bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagiimport com.android.inputmethod.latin.Constants;
22bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagiimport com.android.inputmethod.latin.PrevWordsInfo;
23bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagiimport com.android.inputmethod.latin.PrevWordsInfo.WordInfo;
24bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagiimport com.android.inputmethod.latin.settings.SpacingAndPunctuations;
25bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi
26bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagipublic final class PrevWordsInfoUtils {
27bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    private PrevWordsInfoUtils() {
28bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi        // Intentional empty constructor for utility class.
29bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    }
30bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi
31bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");
32bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // Get context information from nth word before the cursor. n = 1 retrieves the words
33bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // immediately before the cursor, n = 2 retrieves the words before that, and so on. This splits
34bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // on whitespace only.
35bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // Also, it won't return words that end in a separator (if the nth word before the cursor
36bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // ends in a separator, it returns information representing beginning-of-sentence).
37bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // Example (when Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
38bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 1) "abc def|" -> abc, def
39bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 1) "abc def |" -> abc, def
40bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 1) "abc 'def|" -> empty, 'def
41bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 1) "abc def. |" -> beginning-of-sentence
42bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 1) "abc def . |" -> beginning-of-sentence
43bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc def|" -> beginning-of-sentence, abc
44bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc def |" -> beginning-of-sentence, abc
45bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc 'def|" -> empty. The context is different from "abc def", but we cannot
46bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // represent this situation using PrevWordsInfo. See TODO in the method.
47bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // TODO: The next example's result should be "abc, def". This have to be fixed before we
48bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // retrieve the prior context of Beginning-of-Sentence.
49bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc def. |" -> beginning-of-sentence, abc
50bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc def . |" -> abc, def
51bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc|" -> beginning-of-sentence
52bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc |" -> beginning-of-sentence
53bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    // (n = 2) "abc. def|" -> beginning-of-sentence
54bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    public static PrevWordsInfo getPrevWordsInfoFromNthPreviousWord(final CharSequence prev,
55bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            final SpacingAndPunctuations spacingAndPunctuations, final int n) {
56bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi        if (prev == null) return PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
57bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi        final String[] w = SPACE_REGEX.split(prev);
58bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi        final WordInfo[] prevWordsInfo = new WordInfo[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
59bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi        for (int i = 0; i < prevWordsInfo.length; i++) {
60bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            final int focusedWordIndex = w.length - n - i;
61bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            // Referring to the word after the focused word.
62bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            if ((focusedWordIndex + 1) >= 0 && (focusedWordIndex + 1) < w.length) {
63bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                final String wordFollowingTheNthPrevWord = w[focusedWordIndex + 1];
64bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                if (!wordFollowingTheNthPrevWord.isEmpty()) {
65bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                    final char firstChar = wordFollowingTheNthPrevWord.charAt(0);
66bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                    if (spacingAndPunctuations.isWordConnector(firstChar)) {
67bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                        // The word following the focused word is starting with a word connector.
68bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                        // TODO: Return meaningful context for this case.
69bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                        prevWordsInfo[i] = WordInfo.EMPTY_WORD_INFO;
70bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                        break;
71bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                    }
72bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                }
73bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            }
74bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            // If we can't find (n + i) words, the context is beginning-of-sentence.
75bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            if (focusedWordIndex < 0) {
76bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
77bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                break;
78bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            }
79bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            final String focusedWord = w[focusedWordIndex];
80bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            // If the word is, the context is beginning-of-sentence.
81bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            final int length = focusedWord.length();
82bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            if (length <= 0) {
83bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
84bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                break;
85bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            }
86bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            // If ends in a sentence separator, the context is beginning-of-sentence.
87bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            final char lastChar = focusedWord.charAt(length - 1);
88bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            if (spacingAndPunctuations.isSentenceSeparator(lastChar)) {
89bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
90bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                break;
91bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            }
92bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            // If ends in a word separator or connector, the context is unclear.
93bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            // TODO: Return meaningful context for this case.
94bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            if (spacingAndPunctuations.isWordSeparator(lastChar)
95bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                    || spacingAndPunctuations.isWordConnector(lastChar)) {
96bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                prevWordsInfo[i] = WordInfo.EMPTY_WORD_INFO;
97bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi                break;
98bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            }
99bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi            prevWordsInfo[i] = new WordInfo(focusedWord);
100bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi        }
101bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi        return new PrevWordsInfo(prevWordsInfo);
102bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi    }
103bb843eb223ce0f8fb1088ed3393a4165123ddb1fKeisuke Kuroyanagi}
104