prev_words_info.h revision 96990ca77357c8c3c518f71e2d9d8cfc62b2ee88
1b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi/*
2b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project
3b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *
4b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License");
5b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * you may not use this file except in compliance with the License.
6b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * You may obtain a copy of the License at
7b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *
8b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *      http://www.apache.org/licenses/LICENSE-2.0
9b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *
10b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software
11b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS,
12b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * See the License for the specific language governing permissions and
14b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * limitations under the License.
15b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi */
16b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
17b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#ifndef LATINIME_PREV_WORDS_INFO_H
18b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#define LATINIME_PREV_WORDS_INFO_H
19b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
20b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#include "defines.h"
2145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
2245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
2396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi#include "utils/char_utils.h"
24b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
25b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanaginamespace latinime {
26b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
27b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi// TODO: Support n-gram.
28b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi// This class does not take ownership of any code point buffers.
29b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagiclass PrevWordsInfo {
30b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi public:
31b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    // No prev word information.
32b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    PrevWordsInfo() {
33b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        clear();
34b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    }
35b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
36b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount,
37b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi            const bool isBeginningOfSentence) {
38b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        clear();
39b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        mPrevWordCodePoints[0] = prevWordCodePoints;
40b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        mPrevWordCodePointCount[0] = prevWordCodePointCount;
41b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        mIsBeginningOfSentence[0] = isBeginningOfSentence;
42b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    }
4345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi
449f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    bool isValid() const {
459f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
469f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            if (mPrevWordCodePointCount[i] > MAX_WORD_LENGTH) {
479f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi                return false;
489f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            }
499f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        }
509f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        return true;
519f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    }
529f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi
5345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    void getPrevWordsTerminalPtNodePos(
5445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
5596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const {
5645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
5745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
5845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi                    mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
599f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi                    mIsBeginningOfSentence[i], tryLowerCaseSearch);
6045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        }
61b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    }
62b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
6345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
6445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
6596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch(
6696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0],
6796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                mIsBeginningOfSentence[0]);
6896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(),
6996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                bigramListPos);
70b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    }
71b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
729f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    // n is 1-indexed.
739f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    const int *getNthPrevWordCodePoints(const int n) const {
749f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
759f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            return nullptr;
769f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        }
779f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        return mPrevWordCodePoints[n - 1];
789f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    }
799f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi
809f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    // n is 1-indexed.
819f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    int getNthPrevWordCodePointCount(const int n) const {
829f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
839f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            return 0;
849f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        }
859f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        return mPrevWordCodePointCount[n - 1];
869f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    }
879f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi
88b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi private:
89b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo);
90b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
9145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    static int getTerminalPtNodePosOfWord(
9245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
9345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const int *const wordCodePoints, const int wordCodePointCount,
949f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
9545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        if (!dictStructurePolicy || !wordCodePoints) {
9645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            return NOT_A_DICT_POS;
9745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        }
9896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        int codePoints[MAX_WORD_LENGTH];
9996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        int codePointCount = wordCodePointCount;
10096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
10196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        if (isBeginningOfSentence) {
10296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
10396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                    codePointCount, MAX_WORD_LENGTH);
10496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            if (codePointCount <= 0) {
10596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                return NOT_A_DICT_POS;
10696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            }
10796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        }
10845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
10996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                codePoints, codePointCount, false /* forceLowerCaseSearch */);
1109f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
1119f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            // Return the position when when the word was found or doesn't try lower case
1129f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            // search.
11345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            return wordPtNodePos;
11445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        }
11545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        // Check bigrams for lower-cased previous word if original was not found. Useful for
11645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        // auto-capitalized words like "The [current_word]".
11745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        return dictStructurePolicy->getTerminalPtNodePositionOfWord(
11896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                codePoints, codePointCount, true /* forceLowerCaseSearch */);
11996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi    }
12096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi
12196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi    static int getBigramListPositionForWordWithTryingLowerCaseSearch(
12296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
12396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            const int *const wordCodePoints, const int wordCodePointCount,
12496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            const bool isBeginningOfSentence) {
12596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        int codePoints[MAX_WORD_LENGTH];
12696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        int codePointCount = wordCodePointCount;
12796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
12896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        if (isBeginningOfSentence) {
12996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
13096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                    codePointCount, MAX_WORD_LENGTH);
13196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            if (codePointCount <= 0) {
13296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                return NOT_A_DICT_POS;
13396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            }
13496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        }
13596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
13696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                codePointCount, false /* forceLowerCaseSearch */);
13796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
13896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        // dictionary or has no bigrams
13996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        if (NOT_A_DICT_POS == pos) {
14096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            // If no bigrams for this exact word, search again in lower case.
14196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
14296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                    codePointCount, true /* forceLowerCaseSearch */);
14396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        }
14496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        return pos;
14545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    }
14645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi
14745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    static int getBigramListPositionForWord(
14845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
14945d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const int *wordCodePoints, const int wordCodePointCount,
15045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const bool forceLowerCaseSearch) {
15145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        if (!wordCodePoints || wordCodePointCount <= 0) return NOT_A_DICT_POS;
15245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        const int terminalPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
15345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi                wordCodePoints, wordCodePointCount, forceLowerCaseSearch);
15445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        if (NOT_A_DICT_POS == terminalPtNodePos) return NOT_A_DICT_POS;
15545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        return dictStructurePolicy->getBigramsPositionOfPtNode(terminalPtNodePos);
15645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    }
15745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi
158b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    void clear() {
159b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
160b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi            mPrevWordCodePoints[i] = nullptr;
161b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi            mPrevWordCodePointCount[i] = 0;
162b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi            mIsBeginningOfSentence[i] = false;
163b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        }
164b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    }
165b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi
166b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    const int *mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
167b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
168b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
169b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi};
170b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi} // namespace latinime
171b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#endif // LATINIME_PREV_WORDS_INFO_H
172