prev_words_info.h revision 96990ca77357c8c3c518f71e2d9d8cfc62b2ee88
1b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi/* 2b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project 3b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * 4b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License"); 5b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * you may not use this file except in compliance with the License. 6b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * You may obtain a copy of the License at 7b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * 8b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * http://www.apache.org/licenses/LICENSE-2.0 9b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * 10b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software 11b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS, 12b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * See the License for the specific language governing permissions and 14b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi * limitations under the License. 15b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi */ 16b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 17b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#ifndef LATINIME_PREV_WORDS_INFO_H 18b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#define LATINIME_PREV_WORDS_INFO_H 19b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 20b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#include "defines.h" 2145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" 2245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" 2396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi#include "utils/char_utils.h" 24b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 25b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanaginamespace latinime { 26b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 27b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi// TODO: Support n-gram. 28b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi// This class does not take ownership of any code point buffers. 29b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagiclass PrevWordsInfo { 30b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi public: 31b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi // No prev word information. 32b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi PrevWordsInfo() { 33b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi clear(); 34b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 35b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 36b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount, 37b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi const bool isBeginningOfSentence) { 38b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi clear(); 39b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mPrevWordCodePoints[0] = prevWordCodePoints; 40b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mPrevWordCodePointCount[0] = prevWordCodePointCount; 41b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mIsBeginningOfSentence[0] = isBeginningOfSentence; 42b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 4345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi 449f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi bool isValid() const { 459f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 469f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi if (mPrevWordCodePointCount[i] > MAX_WORD_LENGTH) { 479f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return false; 489f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 499f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 509f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return true; 519f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 529f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi 5345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi void getPrevWordsTerminalPtNodePos( 5445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 5596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const { 5645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 5745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, 5845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi mPrevWordCodePoints[i], mPrevWordCodePointCount[i], 599f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi mIsBeginningOfSentence[i], tryLowerCaseSearch); 6045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 61b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi } 62b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 6345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction( 6445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const { 6596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch( 6696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0], 6796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi mIsBeginningOfSentence[0]); 6896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(), 6996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi bigramListPos); 70b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi } 71b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 729f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // n is 1-indexed. 739f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi const int *getNthPrevWordCodePoints(const int n) const { 749f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 759f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return nullptr; 769f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 779f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return mPrevWordCodePoints[n - 1]; 789f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 799f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi 809f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // n is 1-indexed. 819f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi int getNthPrevWordCodePointCount(const int n) const { 829f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 839f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return 0; 849f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 859f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi return mPrevWordCodePointCount[n - 1]; 869f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi } 879f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi 88b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi private: 89b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo); 90b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi 9145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi static int getTerminalPtNodePosOfWord( 9245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 9345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const int *const wordCodePoints, const int wordCodePointCount, 949f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { 9545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi if (!dictStructurePolicy || !wordCodePoints) { 9645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi return NOT_A_DICT_POS; 9745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 9896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int codePoints[MAX_WORD_LENGTH]; 9996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int codePointCount = wordCodePointCount; 10096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); 10196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi if (isBeginningOfSentence) { 10296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, 10396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount, MAX_WORD_LENGTH); 10496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi if (codePointCount <= 0) { 10596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi return NOT_A_DICT_POS; 10696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 10796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 10845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( 10996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePoints, codePointCount, false /* forceLowerCaseSearch */); 1109f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { 1119f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // Return the position when when the word was found or doesn't try lower case 1129f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi // search. 11345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi return wordPtNodePos; 11445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 11545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi // Check bigrams for lower-cased previous word if original was not found. Useful for 11645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi // auto-capitalized words like "The [current_word]". 11745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi return dictStructurePolicy->getTerminalPtNodePositionOfWord( 11896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePoints, codePointCount, true /* forceLowerCaseSearch */); 11996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 12096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi 12196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi static int getBigramListPositionForWordWithTryingLowerCaseSearch( 12296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 12396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi const int *const wordCodePoints, const int wordCodePointCount, 12496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi const bool isBeginningOfSentence) { 12596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int codePoints[MAX_WORD_LENGTH]; 12696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int codePointCount = wordCodePointCount; 12796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); 12896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi if (isBeginningOfSentence) { 12996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, 13096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount, MAX_WORD_LENGTH); 13196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi if (codePointCount <= 0) { 13296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi return NOT_A_DICT_POS; 13396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 13496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 13596990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, 13696990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount, false /* forceLowerCaseSearch */); 13796990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the 13896990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi // dictionary or has no bigrams 13996990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi if (NOT_A_DICT_POS == pos) { 14096990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi // If no bigrams for this exact word, search again in lower case. 14196990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, 14296990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi codePointCount, true /* forceLowerCaseSearch */); 14396990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi } 14496990ca77357c8c3c518f71e2d9d8cfc62b2ee88Keisuke Kuroyanagi return pos; 14545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 14645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi 14745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi static int getBigramListPositionForWord( 14845d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 14945d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const int *wordCodePoints, const int wordCodePointCount, 15045d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const bool forceLowerCaseSearch) { 15145d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi if (!wordCodePoints || wordCodePointCount <= 0) return NOT_A_DICT_POS; 15245d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi const int terminalPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( 15345d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi wordCodePoints, wordCodePointCount, forceLowerCaseSearch); 15445d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi if (NOT_A_DICT_POS == terminalPtNodePos) return NOT_A_DICT_POS; 15545d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi return dictStructurePolicy->getBigramsPositionOfPtNode(terminalPtNodePos); 15645d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi } 15745d1a936a7a318286c4404951db1bd825e25cc7cKeisuke Kuroyanagi 158b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi void clear() { 159b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 160b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mPrevWordCodePoints[i] = nullptr; 161b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mPrevWordCodePointCount[i] = 0; 162b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi mIsBeginningOfSentence[i] = false; 163b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 164b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi } 165b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi 166b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi const int *mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 167b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 168b94ec1437b624a45ad5c0fde2dd385116e5e1163Keisuke Kuroyanagi bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 169b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi}; 170b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi} // namespace latinime 171b87fffb8be3dc6a79e49890a7700704d7fee616bKeisuke Kuroyanagi#endif // LATINIME_PREV_WORDS_INFO_H 172