1b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi/*
2b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project
3b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *
4b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License");
5b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * you may not use this file except in compliance with the License.
6b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * You may obtain a copy of the License at
7b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *
8b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *      http://www.apache.org/licenses/LICENSE-2.0
9b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi *
10b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software
11b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS,
12b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * See the License for the specific language governing permissions and
14b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * limitations under the License.
15b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi */
16b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
17b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#ifndef LATINIME_PREV_WORDS_INFO_H
18b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#define LATINIME_PREV_WORDS_INFO_H
19b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
20b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#include "defines.h"
2145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
2245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
2396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi#include "utils/char_utils.h"
24b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
25b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanaginamespace latinime {
26b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
27b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi// TODO: Support n-gram.
28b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagiclass PrevWordsInfo {
29b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi public:
30b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    // No prev word information.
31b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    PrevWordsInfo() {
32b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        clear();
33b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    }
34b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
3505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi    PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) {
3605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
3705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i];
3805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i],
3905b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi                    sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
4005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            mIsBeginningOfSentence[i] = prevWordsInfo.mIsBeginningOfSentence[i];
4105b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        }
4205b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi    }
4305b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi
4405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi    // Construct from previous words.
4505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi    PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH],
4605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
4705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            const size_t prevWordCount) {
4805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        clear();
4905b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) {
5005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
5105b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi                continue;
5205b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            }
5305b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
5405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi                    sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
5505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
5605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
5705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        }
5805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi    }
5905b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi
6005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi    // Construct from a previous word.
61b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount,
62b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi            const bool isBeginningOfSentence) {
63b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        clear();
6405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
6505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            return;
6605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        }
6705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        memmove(mPrevWordCodePoints[0], prevWordCodePoints,
6805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi                sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
69b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        mPrevWordCodePointCount[0] = prevWordCodePointCount;
70b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        mIsBeginningOfSentence[0] = isBeginningOfSentence;
71b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    }
7245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi
739f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    bool isValid() const {
7405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        if (mPrevWordCodePointCount[0] > 0) {
7505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            return true;
7605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        }
7705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        if (mIsBeginningOfSentence[0]) {
7805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi            return true;
799f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        }
8005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi        return false;
819f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    }
829f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi
8345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    void getPrevWordsTerminalPtNodePos(
8445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
8596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const {
8645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
8745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
8845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi                    mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
899f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi                    mIsBeginningOfSentence[i], tryLowerCaseSearch);
9045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        }
91b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    }
92b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
939f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    // n is 1-indexed.
949f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    const int *getNthPrevWordCodePoints(const int n) const {
959f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
969f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            return nullptr;
979f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        }
989f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        return mPrevWordCodePoints[n - 1];
999f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    }
1009f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi
1019f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    // n is 1-indexed.
1029f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    int getNthPrevWordCodePointCount(const int n) const {
1039f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
1049f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            return 0;
1059f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        }
1069f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        return mPrevWordCodePointCount[n - 1];
1079f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    }
1089f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi
10922931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi    // n is 1-indexed.
11022931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi    bool isNthPrevWordBeginningOfSentence(const int n) const {
11122931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi        if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
11222931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi            return false;
11322931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi        }
11422931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi        return mIsBeginningOfSentence[n - 1];
11522931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi    }
11622931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi
117b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi private:
118b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi    DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo);
119b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi
12045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi    static int getTerminalPtNodePosOfWord(
12145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
12245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            const int *const wordCodePoints, const int wordCodePointCount,
1239f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
1247852765a503fe6dab54e1c4ab9e5e6b7bbdc70f2Keisuke Kuroyanagi        if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
12545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            return NOT_A_DICT_POS;
12645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        }
12796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        int codePoints[MAX_WORD_LENGTH];
12896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        int codePointCount = wordCodePointCount;
12996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
13096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        if (isBeginningOfSentence) {
13196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
13296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                    codePointCount, MAX_WORD_LENGTH);
13396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            if (codePointCount <= 0) {
13496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                return NOT_A_DICT_POS;
13596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi            }
13696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi        }
13745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
13896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                codePoints, codePointCount, false /* forceLowerCaseSearch */);
1399f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
1409f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            // Return the position when when the word was found or doesn't try lower case
1419f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi            // search.
14245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi            return wordPtNodePos;
14345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        }
14445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        // Check bigrams for lower-cased previous word if original was not found. Useful for
14545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        // auto-capitalized words like "The [current_word]".
14645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi        return dictStructurePolicy->getTerminalPtNodePositionOfWord(
14796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi                codePoints, codePointCount, true /* forceLowerCaseSearch */);
14896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi    }
14996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi
150b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    void clear() {
151b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
152b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi            mPrevWordCodePointCount[i] = 0;
153b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi            mIsBeginningOfSentence[i] = false;
154b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi        }
155b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    }
156b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi
15705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi    int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
158b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
159b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi    bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
160b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi};
161b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi} // namespace latinime
162b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#endif // LATINIME_PREV_WORDS_INFO_H
163