1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "dictionary/property/ngram_context.h"
18
19#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
20#include "utils/char_utils.h"
21
22namespace latinime {
23
24NgramContext::NgramContext() : mPrevWordCount(0) {}
25
26NgramContext::NgramContext(const NgramContext &ngramContext)
27        : mPrevWordCount(ngramContext.mPrevWordCount) {
28    for (size_t i = 0; i < mPrevWordCount; ++i) {
29        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
30        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
31                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
32        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
33    }
34}
35
36NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
37        const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
38        const size_t prevWordCount)
39        : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
40    clear();
41    for (size_t i = 0; i < mPrevWordCount; ++i) {
42        if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
43            continue;
44        }
45        memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
46                sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
47        mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
48        mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
49    }
50}
51
52NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
53        const bool isBeginningOfSentence) : mPrevWordCount(1) {
54    clear();
55    if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
56        return;
57    }
58    memmove(mPrevWordCodePoints[0], prevWordCodePoints,
59            sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
60    mPrevWordCodePointCount[0] = prevWordCodePointCount;
61    mIsBeginningOfSentence[0] = isBeginningOfSentence;
62}
63
64bool NgramContext::isValid() const {
65    if (mPrevWordCodePointCount[0] > 0) {
66        return true;
67    }
68    if (mIsBeginningOfSentence[0]) {
69        return true;
70    }
71    return false;
72}
73
74const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
75    if (n <= 0 || n > mPrevWordCount) {
76        return CodePointArrayView();
77    }
78    return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
79}
80
81bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
82    if (n <= 0 || n > mPrevWordCount) {
83        return false;
84    }
85    return mIsBeginningOfSentence[n - 1];
86}
87
88/* static */ int NgramContext::getWordId(
89        const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
90        const int *const wordCodePoints, const int wordCodePointCount,
91        const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
92    if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
93        return NOT_A_WORD_ID;
94    }
95    int codePoints[MAX_WORD_LENGTH];
96    int codePointCount = wordCodePointCount;
97    memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
98    if (isBeginningOfSentence) {
99        codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
100                MAX_WORD_LENGTH);
101        if (codePointCount <= 0) {
102            return NOT_A_WORD_ID;
103        }
104    }
105    const CodePointArrayView codePointArrayView(codePoints, codePointCount);
106    const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
107            false /* forceLowerCaseSearch */);
108    if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
109        // Return the id when when the word was found or doesn't try lower case search.
110        return wordId;
111    }
112    // Check bigrams for lower-cased previous word if original was not found. Useful for
113    // auto-capitalized words like "The [current_word]".
114    return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
115}
116
117void NgramContext::clear() {
118    for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
119        mPrevWordCodePointCount[i] = 0;
120        mIsBeginningOfSentence[i] = false;
121    }
122}
123} // namespace latinime
124