1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
19 * Do not edit this file other than updating policy's interface.
20 *
21 * This file was generated from
22 *   dictionary/structure/v4/ver4_patricia_trie_policy.cpp
23 */
24
25#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
26
27#include <vector>
28
29#include "suggest/core/dicnode/dic_node.h"
30#include "suggest/core/dicnode/dic_node_vector.h"
31#include "dictionary/interface/ngram_listener.h"
32#include "dictionary/property/ngram_context.h"
33#include "dictionary/property/ngram_property.h"
34#include "dictionary/property/unigram_property.h"
35#include "dictionary/property/word_property.h"
36#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
37#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
38#include "dictionary/utils/forgetting_curve_utils.h"
39#include "dictionary/utils/multi_bigram_map.h"
40#include "dictionary/utils/probability_utils.h"
41
42namespace latinime {
43namespace backward {
44namespace v402 {
45
46// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
47// BinaryDictionaryDecayingTests.
48const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
49const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
50const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
51const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
52const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
53const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
54        Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
55const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
56
57void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
58        DicNodeVector *const childDicNodes) const {
59    if (!dicNode->hasChildren()) {
60        return;
61    }
62    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
63    readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
64    while (!readingHelper.isEnd()) {
65        const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
66        if (!ptNodeParams.isValid()) {
67            break;
68        }
69        bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
70        if (isTerminal && mHeaderPolicy->isDecayingDict()) {
71            // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
72            // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
73            // valid terminal DicNode.
74            isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
75        }
76        readingHelper.readNextSiblingNode(ptNodeParams);
77        if (ptNodeParams.representsNonWordInfo()) {
78            // Skip PtNodes that represent non-word information.
79            continue;
80        }
81        const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
82        childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
83                wordId, ptNodeParams.getCodePointArrayView());
84    }
85    if (readingHelper.isError()) {
86        mIsCorrupted = true;
87        AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
88    }
89}
90
91int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
92        const int maxCodePointCount, int *const outCodePoints) const {
93    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
94    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
95    readingHelper.initWithPtNodePos(ptNodePos);
96    const int codePointCount =  readingHelper.getCodePointsAndReturnCodePointCount(
97            maxCodePointCount, outCodePoints);
98    if (readingHelper.isError()) {
99        mIsCorrupted = true;
100        AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
101    }
102    return codePointCount;
103}
104
105int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
106        const bool forceLowerCaseSearch) const {
107    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
108    readingHelper.initWithPtNodeArrayPos(getRootPosition());
109    const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
110            wordCodePoints.size(), forceLowerCaseSearch);
111    if (readingHelper.isError()) {
112        mIsCorrupted = true;
113        AKLOGE("Dictionary reading error in getWordId().");
114    }
115    return getWordIdFromTerminalPtNodePos(ptNodePos);
116}
117
118const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
119        const WordIdArrayView prevWordIds, const int wordId,
120        MultiBigramMap *const multiBigramMap) const {
121    if (wordId == NOT_A_WORD_ID) {
122        return WordAttributes();
123    }
124    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
125    const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
126    if (multiBigramMap) {
127        const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
128                prevWordIds, wordId, ptNodeParams.getProbability());
129        return getWordAttributes(probability, ptNodeParams);
130    }
131    if (!prevWordIds.empty()) {
132        const int probability = getProbabilityOfWord(prevWordIds, wordId);
133        if (probability != NOT_A_PROBABILITY) {
134            return getWordAttributes(probability, ptNodeParams);
135        }
136    }
137    return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
138            ptNodeParams);
139}
140
141const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability,
142        const PtNodeParams &ptNodeParams) const {
143    return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
144            ptNodeParams.getProbability() == 0);
145}
146
147int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
148        const int bigramProbability) const {
149    // In the v4 format, bigramProbability is a conditional probability.
150    const int bigramConditionalProbability = bigramProbability;
151    if (unigramProbability == NOT_A_PROBABILITY) {
152        return NOT_A_PROBABILITY;
153    }
154    if (bigramConditionalProbability == NOT_A_PROBABILITY) {
155        return ProbabilityUtils::backoff(unigramProbability);
156    }
157    return bigramConditionalProbability;
158}
159
160int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
161        const int wordId) const {
162    if (wordId == NOT_A_WORD_ID) {
163        return NOT_A_PROBABILITY;
164    }
165    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
166    const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
167    if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) {
168        return NOT_A_PROBABILITY;
169    }
170    if (prevWordIds.empty()) {
171        return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
172    }
173    if (prevWordIds[0] == NOT_A_WORD_ID) {
174        return NOT_A_PROBABILITY;
175    }
176    const PtNodeParams prevWordPtNodeParams =
177            mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
178    if (prevWordPtNodeParams.isDeleted()) {
179        return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
180    }
181    const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
182            prevWordPtNodeParams.getTerminalId());
183    BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
184    while (bigramsIt.hasNext()) {
185        bigramsIt.next();
186        if (bigramsIt.getBigramPos() == ptNodePos
187                && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
188            const int bigramConditionalProbability = getBigramConditionalProbability(
189                    prevWordPtNodeParams.getProbability(),
190                    prevWordPtNodeParams.representsBeginningOfSentence(),
191                    bigramsIt.getProbability());
192            return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability);
193        }
194    }
195    return NOT_A_PROBABILITY;
196}
197
198void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
199        NgramListener *const listener) const {
200    if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) {
201        return;
202    }
203    const PtNodeParams prevWordPtNodeParams =
204            mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
205    if (prevWordPtNodeParams.isDeleted()) {
206        return;
207    }
208    const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
209            prevWordPtNodeParams.getTerminalId());
210    BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
211    while (bigramsIt.hasNext()) {
212        bigramsIt.next();
213        const int bigramConditionalProbability = getBigramConditionalProbability(
214                prevWordPtNodeParams.getProbability(),
215                prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability());
216        listener->onVisitEntry(bigramConditionalProbability,
217                getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
218    }
219}
220
221int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability,
222        const bool isInBeginningOfSentenceContext, const int bigramProbability) const {
223    if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
224        if (isInBeginningOfSentenceContext) {
225            return bigramProbability;
226        }
227        // Calculate conditional probability.
228        return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability,
229                MAX_PROBABILITY);
230    } else {
231        // bigramProbability is a conditional probability.
232        return bigramProbability;
233    }
234}
235
236BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
237        const int wordId) const {
238    const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
239    return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos);
240}
241
242int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
243    if (ptNodePos == NOT_A_DICT_POS) {
244        return NOT_A_DICT_POS;
245    }
246    const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
247    if (ptNodeParams.isDeleted()) {
248        return NOT_A_DICT_POS;
249    }
250    return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
251            ptNodeParams.getTerminalId());
252}
253
254int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
255    if (ptNodePos == NOT_A_DICT_POS) {
256        return NOT_A_DICT_POS;
257    }
258    const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
259    if (ptNodeParams.isDeleted()) {
260        return NOT_A_DICT_POS;
261    }
262    return mBuffers->getBigramDictContent()->getBigramListHeadPos(
263            ptNodeParams.getTerminalId());
264}
265
266bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
267        const UnigramProperty *const unigramProperty) {
268    if (!mBuffers->isUpdatable()) {
269        AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
270        return false;
271    }
272    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
273        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
274                mDictBuffer->getTailPosition());
275        return false;
276    }
277    if (wordCodePoints.size() > MAX_WORD_LENGTH) {
278        AKLOGE("The word is too long to insert to the dictionary, length: %zd",
279                wordCodePoints.size());
280        return false;
281    }
282    for (const auto &shortcut : unigramProperty->getShortcuts()) {
283        if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
284            AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
285                    shortcut.getTargetCodePoints()->size());
286            return false;
287        }
288    }
289    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
290    readingHelper.initWithPtNodeArrayPos(getRootPosition());
291    bool addedNewUnigram = false;
292    int codePointsToAdd[MAX_WORD_LENGTH];
293    int codePointCountToAdd = wordCodePoints.size();
294    memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
295    if (unigramProperty->representsBeginningOfSentence()) {
296        codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
297                codePointCountToAdd, MAX_WORD_LENGTH);
298    }
299    if (codePointCountToAdd <= 0) {
300        return false;
301    }
302    const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
303    if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
304            &addedNewUnigram)) {
305        if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
306            mEntryCounters.incrementNgramCount(NgramType::Unigram);
307        }
308        if (unigramProperty->getShortcuts().size() > 0) {
309            // Add shortcut target.
310            const int wordPos = getTerminalPtNodePosFromWordId(
311                    getWordId(codePointArrayView, false /* forceLowerCaseSearch */));
312            if (wordPos == NOT_A_DICT_POS) {
313                AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
314                return false;
315            }
316            for (const auto &shortcut : unigramProperty->getShortcuts()) {
317                if (!mUpdatingHelper.addShortcutTarget(wordPos,
318                        CodePointArrayView(*shortcut.getTargetCodePoints()),
319                        shortcut.getProbability())) {
320                    AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
321                            "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
322                            shortcut.getProbability());
323                    return false;
324                }
325            }
326        }
327        return true;
328    } else {
329        return false;
330    }
331}
332
333bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
334    if (!mBuffers->isUpdatable()) {
335        AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
336        return false;
337    }
338    const int ptNodePos = getTerminalPtNodePosFromWordId(
339            getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
340    if (ptNodePos == NOT_A_DICT_POS) {
341        return false;
342    }
343    const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
344    return mNodeWriter.suppressUnigramEntry(&ptNodeParams);
345}
346
347bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
348    if (!mBuffers->isUpdatable()) {
349        AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
350        return false;
351    }
352    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
353        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
354                mDictBuffer->getTailPosition());
355        return false;
356    }
357    const NgramContext *const ngramContext = ngramProperty->getNgramContext();
358    if (!ngramContext->isValid()) {
359        AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
360        return false;
361    }
362    if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
363        AKLOGE("The word is too long to insert the ngram to the dictionary. "
364                "length: %zd", ngramProperty->getTargetCodePoints()->size());
365        return false;
366    }
367    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
368    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
369            false /* tryLowerCaseSearch */);
370    if (prevWordIds.empty()) {
371        return false;
372    }
373    if (prevWordIds[0] == NOT_A_WORD_ID) {
374        if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
375            const UnigramProperty beginningOfSentenceUnigramProperty(
376                    true /* representsBeginningOfSentence */, true /* isNotAWord */,
377                    false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
378            if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
379                    &beginningOfSentenceUnigramProperty)) {
380                AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
381                return false;
382            }
383            // Refresh word ids.
384            ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
385        } else {
386            return false;
387        }
388    }
389    const int wordPos = getTerminalPtNodePosFromWordId(getWordId(
390            CodePointArrayView(*ngramProperty->getTargetCodePoints()),
391                    false /* forceLowerCaseSearch */));
392    if (wordPos == NOT_A_DICT_POS) {
393        return false;
394    }
395    bool addedNewBigram = false;
396    const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
397    if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
398            wordPos, ngramProperty, &addedNewBigram)) {
399        if (addedNewBigram) {
400            mEntryCounters.incrementNgramCount(NgramType::Bigram);
401        }
402        return true;
403    } else {
404        return false;
405    }
406}
407
408bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
409        const CodePointArrayView wordCodePoints) {
410    if (!mBuffers->isUpdatable()) {
411        AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
412        return false;
413    }
414    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
415        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
416                mDictBuffer->getTailPosition());
417        return false;
418    }
419    if (!ngramContext->isValid()) {
420        AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
421        return false;
422    }
423    if (wordCodePoints.size() > MAX_WORD_LENGTH) {
424        AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
425                wordCodePoints.size());
426    }
427    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
428    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
429            false /* tryLowerCaseSerch */);
430    if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
431        return false;
432    }
433    const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints,
434            false /* forceLowerCaseSearch */));
435    if (wordPos == NOT_A_DICT_POS) {
436        return false;
437    }
438    const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
439    if (mUpdatingHelper.removeNgramEntry(
440            PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) {
441        mEntryCounters.decrementNgramCount(NgramType::Bigram);
442        return true;
443    } else {
444        return false;
445    }
446}
447
448
449bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
450        const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
451        const bool isValidWord, const HistoricalInfo historicalInfo) {
452    if (!mBuffers->isUpdatable()) {
453        AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
454                "dictionary.");
455        return false;
456    }
457    const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY;
458    const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
459            false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo);
460    if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
461        AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
462        return false;
463    }
464    const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
465            ? NOT_A_PROBABILITY : probability;
466    const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram,
467            historicalInfo);
468    if (!addNgramEntry(&ngramProperty)) {
469        AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
470        return false;
471    }
472    return true;
473}
474
475bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
476    if (!mBuffers->isUpdatable()) {
477        AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
478        return false;
479    }
480    if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
481        AKLOGE("Cannot flush the dictionary to file.");
482        mIsCorrupted = true;
483        return false;
484    }
485    return true;
486}
487
488bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
489    if (!mBuffers->isUpdatable()) {
490        AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
491        return false;
492    }
493    if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
494        AKLOGE("Cannot flush the dictionary to file with GC.");
495        mIsCorrupted = true;
496        return false;
497    }
498    return true;
499}
500
501bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
502    if (!mBuffers->isUpdatable()) {
503        AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
504        return false;
505    }
506    if (mBuffers->isNearSizeLimit()) {
507        // Additional buffer size is near the limit.
508        return true;
509    } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
510            > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
511        // Total extended region size of the trie exceeds the limit.
512        return true;
513    } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
514            && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
515        // Needs to reduce dictionary size.
516        return true;
517    } else if (mHeaderPolicy->isDecayingDict()) {
518        return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
519                mHeaderPolicy);
520    }
521    return false;
522}
523
524void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
525        char *const outResult, const int maxResultLength) {
526    const int compareLength = queryLength + 1 /* terminator */;
527    if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
528        snprintf(outResult, maxResultLength, "%d",
529                mEntryCounters.getNgramCount(NgramType::Unigram));
530    } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
531        snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
532    } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
533        snprintf(outResult, maxResultLength, "%d",
534                mHeaderPolicy->isDecayingDict() ?
535                        ForgettingCurveUtils::getEntryCountHardLimit(
536                                mHeaderPolicy->getMaxNgramCounts().getNgramCount(
537                                        NgramType::Unigram)) :
538                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
539    } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
540        snprintf(outResult, maxResultLength, "%d",
541                mHeaderPolicy->isDecayingDict() ?
542                        ForgettingCurveUtils::getEntryCountHardLimit(
543                                mHeaderPolicy->getMaxNgramCounts().getNgramCount(
544                                        NgramType::Bigram)) :
545                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
546    }
547}
548
549const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
550        const CodePointArrayView wordCodePoints) const {
551    const int ptNodePos = getTerminalPtNodePosFromWordId(
552            getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
553    if (ptNodePos == NOT_A_DICT_POS) {
554        AKLOGE("getWordProperty is called for invalid word.");
555        return WordProperty();
556    }
557    const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
558    const ProbabilityEntry probabilityEntry =
559            mBuffers->getProbabilityDictContent()->getProbabilityEntry(
560                    ptNodeParams.getTerminalId());
561    const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
562    // Fetch bigram information.
563    std::vector<NgramProperty> ngrams;
564    const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
565    if (bigramListPos != NOT_A_DICT_POS) {
566        int bigramWord1CodePoints[MAX_WORD_LENGTH];
567        const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent();
568        const TerminalPositionLookupTable *const terminalPositionLookupTable =
569                mBuffers->getTerminalPositionLookupTable();
570        bool hasNext = true;
571        int readingPos = bigramListPos;
572        while (hasNext) {
573            const BigramEntry bigramEntry =
574                    bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
575            hasNext = bigramEntry.hasNext();
576            const int word1TerminalId = bigramEntry.getTargetTerminalId();
577            const int word1TerminalPtNodePos =
578                    terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId);
579            if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
580                continue;
581            }
582            const int codePointCount = getCodePointsAndReturnCodePointCount(
583                    getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH,
584                    bigramWord1CodePoints);
585            const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
586            const int rawBigramProbability = bigramEntry.hasHistoricalInfo()
587                    ? ForgettingCurveUtils::decodeProbability(
588                            bigramEntry.getHistoricalInfo(), mHeaderPolicy)
589                    : bigramEntry.getProbability();
590            const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(),
591                    ptNodeParams.representsBeginningOfSentence(), rawBigramProbability);
592            ngrams.emplace_back(
593                    NgramContext(wordCodePoints.data(), wordCodePoints.size(),
594                            ptNodeParams.representsBeginningOfSentence()),
595                    CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
596                    probability, *historicalInfo);
597        }
598    }
599    // Fetch shortcut information.
600    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
601    int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
602    if (shortcutPos != NOT_A_DICT_POS) {
603        int shortcutTarget[MAX_WORD_LENGTH];
604        const ShortcutDictContent *const shortcutDictContent =
605                mBuffers->getShortcutDictContent();
606        bool hasNext = true;
607        while (hasNext) {
608            int shortcutTargetLength = 0;
609            int shortcutProbability = NOT_A_PROBABILITY;
610            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
611                    &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
612            shortcuts.emplace_back(
613                    CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
614                    shortcutProbability);
615        }
616    }
617    const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
618            ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
619            ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts));
620    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
621}
622
623int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
624        int *const outCodePointCount) {
625    *outCodePointCount = 0;
626    if (token == 0) {
627        mTerminalPtNodePositionsForIteratingWords.clear();
628        DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
629                &mTerminalPtNodePositionsForIteratingWords);
630        DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
631        readingHelper.initWithPtNodeArrayPos(getRootPosition());
632        readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
633    }
634    const int terminalPtNodePositionsVectorSize =
635            static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
636    if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
637        AKLOGE("Given token %d is invalid.", token);
638        return 0;
639    }
640    const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
641    *outCodePointCount = getCodePointsAndReturnCodePointCount(
642            getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
643    const int nextToken = token + 1;
644    if (nextToken >= terminalPtNodePositionsVectorSize) {
645        // All words have been iterated.
646        mTerminalPtNodePositionsForIteratingWords.clear();
647        return 0;
648    }
649    return nextToken;
650}
651
652int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
653    return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
654}
655
656int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
657    return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
658}
659
660} // namespace v402
661} // namespace backward
662} // namespace latinime
663