1f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi/* 2f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project 3f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * 4f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License"); 5f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * you may not use this file except in compliance with the License. 6f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * You may obtain a copy of the License at 7f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * 8f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * http://www.apache.org/licenses/LICENSE-2.0 9f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * 10f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software 11f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS, 12f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * See the License for the specific language governing permissions and 14f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * limitations under the License. 15f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi */ 16f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 1788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/ngram_context.h" 18f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 1988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" 20f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi#include "utils/char_utils.h" 21f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 22f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanaginamespace latinime { 23f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 24f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext() : mPrevWordCount(0) {} 25f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 26f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext(const NgramContext &ngramContext) 27f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi : mPrevWordCount(ngramContext.mPrevWordCount) { 28f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi for (size_t i = 0; i < mPrevWordCount; ++i) { 29f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i]; 30f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i], 31f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); 32f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i]; 33f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 34f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 35f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 36f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH], 37f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, 38f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const size_t prevWordCount) 39f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) { 40f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi clear(); 41f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi for (size_t i = 0; i < mPrevWordCount; ++i) { 42f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { 43f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi continue; 44f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 45f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], 46f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); 47f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; 48f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; 49f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 50f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 51f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 52f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount, 53f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const bool isBeginningOfSentence) : mPrevWordCount(1) { 54f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi clear(); 55f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { 56f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return; 57f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 58f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi memmove(mPrevWordCodePoints[0], prevWordCodePoints, 59f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); 60f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mPrevWordCodePointCount[0] = prevWordCodePointCount; 61f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mIsBeginningOfSentence[0] = isBeginningOfSentence; 62f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 63f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 64f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagibool NgramContext::isValid() const { 65f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (mPrevWordCodePointCount[0] > 0) { 66f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return true; 67f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 68f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (mIsBeginningOfSentence[0]) { 69f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return true; 70f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 71f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return false; 72f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 73f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 74f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagiconst CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const { 75f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (n <= 0 || n > mPrevWordCount) { 76f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return CodePointArrayView(); 77f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 78f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]); 79f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 80f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 81f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagibool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const { 82f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (n <= 0 || n > mPrevWordCount) { 83f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return false; 84f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 85f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return mIsBeginningOfSentence[n - 1]; 86f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 87f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 88f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi/* static */ int NgramContext::getWordId( 89f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 90f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const int *const wordCodePoints, const int wordCodePointCount, 91f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { 92f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { 93f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return NOT_A_WORD_ID; 94f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 95f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi int codePoints[MAX_WORD_LENGTH]; 96f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi int codePointCount = wordCodePointCount; 97f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); 98f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (isBeginningOfSentence) { 99f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount, 100f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi MAX_WORD_LENGTH); 101f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (codePointCount <= 0) { 102f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return NOT_A_WORD_ID; 103f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 104f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 105f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const CodePointArrayView codePointArrayView(codePoints, codePointCount); 106f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi const int wordId = dictStructurePolicy->getWordId(codePointArrayView, 107f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi false /* forceLowerCaseSearch */); 108f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) { 109f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi // Return the id when when the word was found or doesn't try lower case search. 110f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return wordId; 111f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 112f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi // Check bigrams for lower-cased previous word if original was not found. Useful for 113f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi // auto-capitalized words like "The [current_word]". 114f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */); 115f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 116f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi 117f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagivoid NgramContext::clear() { 118f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 119f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mPrevWordCodePointCount[i] = 0; 120f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi mIsBeginningOfSentence[i] = false; 121f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi } 122f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} 123f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} // namespace latinime 124