1/* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "dictionary/property/ngram_context.h" 18 19#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" 20#include "utils/char_utils.h" 21 22namespace latinime { 23 24NgramContext::NgramContext() : mPrevWordCount(0) {} 25 26NgramContext::NgramContext(const NgramContext &ngramContext) 27 : mPrevWordCount(ngramContext.mPrevWordCount) { 28 for (size_t i = 0; i < mPrevWordCount; ++i) { 29 mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i]; 30 memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i], 31 sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); 32 mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i]; 33 } 34} 35 36NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH], 37 const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, 38 const size_t prevWordCount) 39 : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) { 40 clear(); 41 for (size_t i = 0; i < mPrevWordCount; ++i) { 42 if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { 43 continue; 44 } 45 memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], 46 sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); 47 mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; 48 mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; 49 } 50} 51 52NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount, 53 const bool isBeginningOfSentence) : mPrevWordCount(1) { 54 clear(); 55 if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { 56 return; 57 } 58 memmove(mPrevWordCodePoints[0], prevWordCodePoints, 59 sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); 60 mPrevWordCodePointCount[0] = prevWordCodePointCount; 61 mIsBeginningOfSentence[0] = isBeginningOfSentence; 62} 63 64bool NgramContext::isValid() const { 65 if (mPrevWordCodePointCount[0] > 0) { 66 return true; 67 } 68 if (mIsBeginningOfSentence[0]) { 69 return true; 70 } 71 return false; 72} 73 74const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const { 75 if (n <= 0 || n > mPrevWordCount) { 76 return CodePointArrayView(); 77 } 78 return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]); 79} 80 81bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const { 82 if (n <= 0 || n > mPrevWordCount) { 83 return false; 84 } 85 return mIsBeginningOfSentence[n - 1]; 86} 87 88/* static */ int NgramContext::getWordId( 89 const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 90 const int *const wordCodePoints, const int wordCodePointCount, 91 const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { 92 if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { 93 return NOT_A_WORD_ID; 94 } 95 int codePoints[MAX_WORD_LENGTH]; 96 int codePointCount = wordCodePointCount; 97 memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); 98 if (isBeginningOfSentence) { 99 codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount, 100 MAX_WORD_LENGTH); 101 if (codePointCount <= 0) { 102 return NOT_A_WORD_ID; 103 } 104 } 105 const CodePointArrayView codePointArrayView(codePoints, codePointCount); 106 const int wordId = dictStructurePolicy->getWordId(codePointArrayView, 107 false /* forceLowerCaseSearch */); 108 if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) { 109 // Return the id when when the word was found or doesn't try lower case search. 110 return wordId; 111 } 112 // Check bigrams for lower-cased previous word if original was not found. Useful for 113 // auto-capitalized words like "The [current_word]". 114 return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */); 115} 116 117void NgramContext::clear() { 118 for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 119 mPrevWordCodePointCount[i] = 0; 120 mIsBeginningOfSentence[i] = false; 121 } 122} 123} // namespace latinime 124