1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
18
19#include <cstdio>
20#include <cstring>
21#include <ctime>
22
23#include "defines.h"
24#include "suggest/core/dicnode/dic_node.h"
25#include "suggest/core/dicnode/dic_node_vector.h"
26#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
27#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
28#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
29#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
30#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
31#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
32#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
33
34namespace latinime {
35
36// Note that these are corresponding definitions in Java side in BinaryDictionaryTests and
37// BinaryDictionaryDecayingTests.
38const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
39const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
40const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
41const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
42const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY =
43        "SET_NEEDS_TO_DECAY_FOR_TESTING";
44const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
45const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
46        DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024;
47
48void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
49        DicNodeVector *const childDicNodes) const {
50    if (!dicNode->hasChildren()) {
51        return;
52    }
53    DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
54            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
55    readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPos());
56    const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
57    while (!readingHelper.isEnd()) {
58        bool isTerminal = nodeReader->isTerminal() && !nodeReader->isDeleted();
59        if (isTerminal && mHeaderPolicy.isDecayingDict()) {
60            // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
61            // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
62            // valid terminal DicNode.
63            isTerminal = getProbability(nodeReader->getProbability(), NOT_A_PROBABILITY)
64                    != NOT_A_PROBABILITY;
65        }
66        childDicNodes->pushLeavingChild(dicNode, nodeReader->getHeadPos(),
67                nodeReader->getChildrenPos(), nodeReader->getProbability(), isTerminal,
68                nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(),
69                nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints());
70        readingHelper.readNextSiblingNode();
71    }
72}
73
74int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
75        const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
76        int *const outUnigramProbability) const {
77    // This method traverses parent nodes from the terminal by following parent pointers; thus,
78    // node code points are stored in the buffer in the reverse order.
79    int reverseCodePoints[maxCodePointCount];
80    DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
81            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
82    // First, read the terminal node and get its probability.
83    readingHelper.initWithPtNodePos(ptNodePos);
84    if (!readingHelper.isValidTerminalNode()) {
85        // Node at the ptNodePos is not a valid terminal node.
86        *outUnigramProbability = NOT_A_PROBABILITY;
87        return 0;
88    }
89    // Store terminal node probability.
90    *outUnigramProbability = readingHelper.getNodeReader()->getProbability();
91    // Then, following parent node link to the dictionary root and fetch node code points.
92    while (!readingHelper.isEnd()) {
93        if (readingHelper.getTotalCodePointCount() > maxCodePointCount) {
94            // The ptNodePos is not a valid terminal node position in the dictionary.
95            *outUnigramProbability = NOT_A_PROBABILITY;
96            return 0;
97        }
98        // Store node code points to buffer in the reverse order.
99        readingHelper.fetchMergedNodeCodePointsInReverseOrder(
100                readingHelper.getPrevTotalCodePointCount(), reverseCodePoints);
101        // Follow parent node toward the root node.
102        readingHelper.readParentNode();
103    }
104    if (readingHelper.isError()) {
105        // The node position or the dictionary is invalid.
106        *outUnigramProbability = NOT_A_PROBABILITY;
107        return 0;
108    }
109    // Reverse the stored code points to output them.
110    const int codePointCount = readingHelper.getTotalCodePointCount();
111    for (int i = 0; i < codePointCount; ++i) {
112        outCodePoints[i] = reverseCodePoints[codePointCount - i - 1];
113    }
114    return codePointCount;
115}
116
117int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord,
118        const int length, const bool forceLowerCaseSearch) const {
119    int searchCodePoints[length];
120    for (int i = 0; i < length; ++i) {
121        searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
122    }
123    DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
124            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
125    readingHelper.initWithPtNodeArrayPos(getRootPosition());
126    const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
127    while (!readingHelper.isEnd()) {
128        const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount();
129        if (readingHelper.getTotalCodePointCount() > length
130                || !readingHelper.isMatchedCodePoint(0 /* index */,
131                        searchCodePoints[matchedCodePointCount])) {
132            // Current node has too many code points or its first code point is different from
133            // target code point. Skip this node and read the next sibling node.
134            readingHelper.readNextSiblingNode();
135            continue;
136        }
137        // Check following merged node code points.
138        const int nodeCodePointCount = nodeReader->getCodePointCount();
139        for (int j = 1; j < nodeCodePointCount; ++j) {
140            if (!readingHelper.isMatchedCodePoint(
141                    j, searchCodePoints[matchedCodePointCount + j])) {
142                // Different code point is found. The given word is not included in the dictionary.
143                return NOT_A_DICT_POS;
144            }
145        }
146        // All characters are matched.
147        if (length == readingHelper.getTotalCodePointCount()) {
148            // Terminal position is found.
149            return nodeReader->getHeadPos();
150        }
151        if (!nodeReader->hasChildren()) {
152            return NOT_A_DICT_POS;
153        }
154        // Advance to the children nodes.
155        readingHelper.readChildNode();
156    }
157    // If we already traversed the tree further than the word is long, there means
158    // there was no match (or we would have found it).
159    return NOT_A_DICT_POS;
160}
161
162int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
163        const int bigramProbability) const {
164    if (mHeaderPolicy.isDecayingDict()) {
165        return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability);
166    } else {
167        if (unigramProbability == NOT_A_PROBABILITY) {
168            return NOT_A_PROBABILITY;
169        } else if (bigramProbability == NOT_A_PROBABILITY) {
170            return ProbabilityUtils::backoff(unigramProbability);
171        } else {
172            return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
173                    bigramProbability);
174        }
175    }
176}
177
178int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const {
179    if (ptNodePos == NOT_A_DICT_POS) {
180        return NOT_A_PROBABILITY;
181    }
182    DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer,
183            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
184    nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
185    if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) {
186        return NOT_A_PROBABILITY;
187    }
188    return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY);
189}
190
191int DynamicPatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
192    if (ptNodePos == NOT_A_DICT_POS) {
193        return NOT_A_DICT_POS;
194    }
195    DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer,
196            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
197    nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
198    if (nodeReader.isDeleted()) {
199        return NOT_A_DICT_POS;
200    }
201    return nodeReader.getShortcutPos();
202}
203
204int DynamicPatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
205    if (ptNodePos == NOT_A_DICT_POS) {
206        return NOT_A_DICT_POS;
207    }
208    DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer,
209            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
210    nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
211    if (nodeReader.isDeleted()) {
212        return NOT_A_DICT_POS;
213    }
214    return nodeReader.getBigramsPos();
215}
216
217bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length,
218        const int probability) {
219    if (!mBuffer->isUpdatable()) {
220        AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
221        return false;
222    }
223    if (mBufferWithExtendableBuffer.getTailPosition()
224            >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
225        AKLOGE("The dictionary is too large to dynamically update.");
226        return false;
227    }
228    DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
229            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
230    readingHelper.initWithPtNodeArrayPos(getRootPosition());
231    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
232            &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
233    bool addedNewUnigram = false;
234    if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
235            &addedNewUnigram)) {
236        if (addedNewUnigram) {
237            mUnigramCount++;
238        }
239        return true;
240    } else {
241        return false;
242    }
243}
244
245bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
246        const int *const word1, const int length1, const int probability) {
247    if (!mBuffer->isUpdatable()) {
248        AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
249        return false;
250    }
251    if (mBufferWithExtendableBuffer.getTailPosition()
252            >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
253        AKLOGE("The dictionary is too large to dynamically update.");
254        return false;
255    }
256    const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
257            false /* forceLowerCaseSearch */);
258    if (word0Pos == NOT_A_DICT_POS) {
259        return false;
260    }
261    const int word1Pos = getTerminalNodePositionOfWord(word1, length1,
262            false /* forceLowerCaseSearch */);
263    if (word1Pos == NOT_A_DICT_POS) {
264        return false;
265    }
266    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
267            &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
268    bool addedNewBigram = false;
269    if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
270        if (addedNewBigram) {
271            mBigramCount++;
272        }
273        return true;
274    } else {
275        return false;
276    }
277}
278
279bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
280        const int *const word1, const int length1) {
281    if (!mBuffer->isUpdatable()) {
282        AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary.");
283        return false;
284    }
285    if (mBufferWithExtendableBuffer.getTailPosition()
286            >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
287        AKLOGE("The dictionary is too large to dynamically update.");
288        return false;
289    }
290    const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
291            false /* forceLowerCaseSearch */);
292    if (word0Pos == NOT_A_DICT_POS) {
293        return false;
294    }
295    const int word1Pos = getTerminalNodePositionOfWord(word1, length1,
296            false /* forceLowerCaseSearch */);
297    if (word1Pos == NOT_A_DICT_POS) {
298        return false;
299    }
300    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
301            &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
302    if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
303        mBigramCount--;
304        return true;
305    } else {
306        return false;
307    }
308}
309
310void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
311    if (!mBuffer->isUpdatable()) {
312        AKLOGI("Warning: flush() is called for non-updatable dictionary.");
313        return;
314    }
315    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
316            &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */);
317    writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
318}
319
320void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
321    if (!mBuffer->isUpdatable()) {
322        AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
323        return;
324    }
325    const bool needsToDecay = mHeaderPolicy.isDecayingDict()
326            && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
327                    false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy));
328    DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer,
329            &mShortcutListPolicy, needsToDecay);
330    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
331            &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay);
332    writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy);
333    mNeedsToDecayForTesting = false;
334}
335
336bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
337    if (!mBuffer->isUpdatable()) {
338        AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
339        return false;
340    }
341    if (mBufferWithExtendableBuffer.isNearSizeLimit()) {
342        // Additional buffer size is near the limit.
343        return true;
344    } else if (mHeaderPolicy.getExtendedRegionSize()
345            + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize()
346                    > MAX_DICT_EXTENDED_REGION_SIZE) {
347        // Total extended region size exceeds the limit.
348        return true;
349    } else if (mBufferWithExtendableBuffer.getTailPosition()
350            >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
351                    && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) {
352        // Needs to reduce dictionary size.
353        return true;
354    } else if (mHeaderPolicy.isDecayingDict()) {
355        return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
356                mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy);
357    }
358    return false;
359}
360
361void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
362        const int maxResultLength) {
363    if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
364        snprintf(outResult, maxResultLength, "%d", mUnigramCount);
365    } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
366        snprintf(outResult, maxResultLength, "%d", mBigramCount);
367    } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
368        snprintf(outResult, maxResultLength, "%d",
369                mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT :
370                        static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE));
371    } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
372        snprintf(outResult, maxResultLength, "%d",
373                mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
374                        static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE));
375    } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) {
376        mNeedsToDecayForTesting = true;
377    }
378}
379
380} // namespace latinime
381