1f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi/*
2f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project
3f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi *
4f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License");
5f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * you may not use this file except in compliance with the License.
6f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * You may obtain a copy of the License at
7f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi *
8f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi *      http://www.apache.org/licenses/LICENSE-2.0
9f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi *
10f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software
11f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS,
12f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * See the License for the specific language governing permissions and
14f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi * limitations under the License.
15f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi */
16f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
1788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/ngram_context.h"
18f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
1988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
20f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi#include "utils/char_utils.h"
21f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
22f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanaginamespace latinime {
23f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
24f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext() : mPrevWordCount(0) {}
25f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
26f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext(const NgramContext &ngramContext)
27f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        : mPrevWordCount(ngramContext.mPrevWordCount) {
28f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    for (size_t i = 0; i < mPrevWordCount; ++i) {
29f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
30f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
31f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
32f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
33f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
34f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
35f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
36f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
37f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
38f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        const size_t prevWordCount)
39f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
40f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    clear();
41f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    for (size_t i = 0; i < mPrevWordCount; ++i) {
42f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
43f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi            continue;
44f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        }
45f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
46f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi                sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
47f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
48f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
49f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
50f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
51f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
52f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke KuroyanagiNgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
53f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        const bool isBeginningOfSentence) : mPrevWordCount(1) {
54f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    clear();
55f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
56f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        return;
57f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
58f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    memmove(mPrevWordCodePoints[0], prevWordCodePoints,
59f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi            sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
60f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    mPrevWordCodePointCount[0] = prevWordCodePointCount;
61f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    mIsBeginningOfSentence[0] = isBeginningOfSentence;
62f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
63f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
64f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagibool NgramContext::isValid() const {
65f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (mPrevWordCodePointCount[0] > 0) {
66f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        return true;
67f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
68f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (mIsBeginningOfSentence[0]) {
69f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        return true;
70f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
71f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    return false;
72f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
73f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
74f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagiconst CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
75f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (n <= 0 || n > mPrevWordCount) {
76f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        return CodePointArrayView();
77f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
78f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
79f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
80f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
81f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagibool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
82f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (n <= 0 || n > mPrevWordCount) {
83f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        return false;
84f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
85f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    return mIsBeginningOfSentence[n - 1];
86f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
87f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
88f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi/* static */ int NgramContext::getWordId(
89f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
90f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        const int *const wordCodePoints, const int wordCodePointCount,
91f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
92f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
93f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        return NOT_A_WORD_ID;
94f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
95f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    int codePoints[MAX_WORD_LENGTH];
96f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    int codePointCount = wordCodePointCount;
97f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
98f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (isBeginningOfSentence) {
99f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
100f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi                MAX_WORD_LENGTH);
101f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        if (codePointCount <= 0) {
102f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi            return NOT_A_WORD_ID;
103f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        }
104f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
105f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    const CodePointArrayView codePointArrayView(codePoints, codePointCount);
106f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
107f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi            false /* forceLowerCaseSearch */);
108f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
109f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        // Return the id when when the word was found or doesn't try lower case search.
110f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        return wordId;
111f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
112f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    // Check bigrams for lower-cased previous word if original was not found. Useful for
113f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    // auto-capitalized words like "The [current_word]".
114f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
115f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
116f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi
117f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagivoid NgramContext::clear() {
118f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
119f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        mPrevWordCodePointCount[i] = 0;
120f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi        mIsBeginningOfSentence[i] = false;
121f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi    }
122f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi}
123f87bb77a9183d126847d5925c2b03bec45fabd6dKeisuke Kuroyanagi} // namespace latinime
124