1b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi/* 2b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project 3b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * 4b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License"); 5b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * you may not use this file except in compliance with the License. 6b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * You may obtain a copy of the License at 7b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * 8b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * http://www.apache.org/licenses/LICENSE-2.0 9b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * 10b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software 11b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS, 12b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * See the License for the specific language governing permissions and 14b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * limitations under the License. 15b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi */ 16b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 17b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#ifndef LATINIME_PREV_WORDS_INFO_H 18b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#define LATINIME_PREV_WORDS_INFO_H 19b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 20b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#include "defines.h" 2145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" 2245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" 2396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi#include "utils/char_utils.h" 24b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 25b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanaginamespace latinime { 26b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 27b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi// TODO: Support n-gram. 28b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagiclass PrevWordsInfo { 29b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi public: 30b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi // No prev word information. 31b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi PrevWordsInfo() { 32b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi clear(); 33b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 34b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 3505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) { 3605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 3705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i]; 3805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i], 3905b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); 4005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi mIsBeginningOfSentence[i] = prevWordsInfo.mIsBeginningOfSentence[i]; 4105b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi } 4205b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi } 4305b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi 4405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi // Construct from previous words. 4505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH], 4605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, 4705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi const size_t prevWordCount) { 4805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi clear(); 4905b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) { 5005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { 5105b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi continue; 5205b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi } 5305b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], 5405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); 5505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; 5605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; 5705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi } 5805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi } 5905b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi 6005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi // Construct from a previous word. 61b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount, 62b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi const bool isBeginningOfSentence) { 63b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi clear(); 6405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { 6505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi return; 6605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi } 6705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi memmove(mPrevWordCodePoints[0], prevWordCodePoints, 6805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); 69b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mPrevWordCodePointCount[0] = prevWordCodePointCount; 70b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mIsBeginningOfSentence[0] = isBeginningOfSentence; 71b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 7245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi 739f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi bool isValid() const { 7405b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi if (mPrevWordCodePointCount[0] > 0) { 7505b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi return true; 7605b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi } 7705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi if (mIsBeginningOfSentence[0]) { 7805b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi return true; 799f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 8005b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi return false; 819f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 829f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi 8345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi void getPrevWordsTerminalPtNodePos( 8445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 8596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const { 8645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 8745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, 8845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi mPrevWordCodePoints[i], mPrevWordCodePointCount[i], 899f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi mIsBeginningOfSentence[i], tryLowerCaseSearch); 9045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 91b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi } 92b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 939f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // n is 1-indexed. 949f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi const int *getNthPrevWordCodePoints(const int n) const { 959f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 969f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return nullptr; 979f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 989f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return mPrevWordCodePoints[n - 1]; 999f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 1009f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi 1019f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // n is 1-indexed. 1029f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi int getNthPrevWordCodePointCount(const int n) const { 1039f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 1049f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return 0; 1059f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 1069f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return mPrevWordCodePointCount[n - 1]; 1079f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 1089f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi 10922931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi // n is 1-indexed. 11022931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi bool isNthPrevWordBeginningOfSentence(const int n) const { 11122931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 11222931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi return false; 11322931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi } 11422931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi return mIsBeginningOfSentence[n - 1]; 11522931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi } 11622931cd94155b5623b9fa52c0596a44aa89bf606Keisuke Kuroyanagi 117b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi private: 118b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo); 119b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 12045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi static int getTerminalPtNodePosOfWord( 12145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 12245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const int *const wordCodePoints, const int wordCodePointCount, 1239f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { 1247852765a503fe6dab54e1c4ab9e5e6b7bbdc70f2Keisuke Kuroyanagi if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { 12545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi return NOT_A_DICT_POS; 12645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 12796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int codePoints[MAX_WORD_LENGTH]; 12896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int codePointCount = wordCodePointCount; 12996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); 13096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi if (isBeginningOfSentence) { 13196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, 13296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount, MAX_WORD_LENGTH); 13396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi if (codePointCount <= 0) { 13496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi return NOT_A_DICT_POS; 13596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 13696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 13745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( 13896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePoints, codePointCount, false /* forceLowerCaseSearch */); 1399f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { 1409f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // Return the position when when the word was found or doesn't try lower case 1419f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // search. 14245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi return wordPtNodePos; 14345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 14445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi // Check bigrams for lower-cased previous word if original was not found. Useful for 14545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi // auto-capitalized words like "The [current_word]". 14645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi return dictStructurePolicy->getTerminalPtNodePositionOfWord( 14796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePoints, codePointCount, true /* forceLowerCaseSearch */); 14896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 14996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi 150b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi void clear() { 151b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 152b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mPrevWordCodePointCount[i] = 0; 153b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mIsBeginningOfSentence[i] = false; 154b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 155b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 156b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi 15705b1e0d42f9f103516103d4d33e61862c0851e9dKeisuke Kuroyanagi int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; 158b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 159b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 160b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi}; 161b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi} // namespace latinime 162b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#endif // LATINIME_PREV_WORDS_INFO_H 163