ver4_patricia_trie_policy.cpp revision 9069d30043d5182dfd38465ad9bbc11ad73fab7c
1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h"
18
19#include <vector>
20
21#include "suggest/core/dicnode/dic_node.h"
22#include "suggest/core/dicnode/dic_node_vector.h"
23#include "suggest/core/dictionary/ngram_listener.h"
24#include "suggest/core/dictionary/property/bigram_property.h"
25#include "suggest/core/dictionary/property/unigram_property.h"
26#include "suggest/core/dictionary/property/word_property.h"
27#include "suggest/core/session/prev_words_info.h"
28#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
29#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
30#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
31#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
32
33namespace latinime {
34
35// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
36// BinaryDictionaryDecayingTests.
37const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
38const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
39const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
40const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
41const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
42const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
43        Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
44
45void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
46        DicNodeVector *const childDicNodes) const {
47    if (!dicNode->hasChildren()) {
48        return;
49    }
50    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
51    readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
52    while (!readingHelper.isEnd()) {
53        const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
54        if (!ptNodeParams.isValid()) {
55            break;
56        }
57        bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
58        if (isTerminal && mHeaderPolicy->isDecayingDict()) {
59            // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
60            // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
61            // valid terminal DicNode.
62            isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
63        }
64        readingHelper.readNextSiblingNode(ptNodeParams);
65        if (ptNodeParams.representsNonWordInfo()) {
66            // Skip PtNodes that represent non-word information.
67            continue;
68        }
69        childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
70                ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
71                ptNodeParams.hasChildren(),
72                ptNodeParams.isBlacklisted()
73                        || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
74                ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
75    }
76    if (readingHelper.isError()) {
77        mIsCorrupted = true;
78        AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
79    }
80}
81
82int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
83        const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
84        int *const outUnigramProbability) const {
85    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
86    readingHelper.initWithPtNodePos(ptNodePos);
87    const int codePointCount =  readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
88            maxCodePointCount, outCodePoints, outUnigramProbability);
89    if (readingHelper.isError()) {
90        mIsCorrupted = true;
91        AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
92    }
93    return codePointCount;
94}
95
96int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord,
97        const int length, const bool forceLowerCaseSearch) const {
98    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
99    readingHelper.initWithPtNodeArrayPos(getRootPosition());
100    const int ptNodePos =
101            readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch);
102    if (readingHelper.isError()) {
103        mIsCorrupted = true;
104        AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
105    }
106    return ptNodePos;
107}
108
109int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
110        const int bigramProbability) const {
111    if (mHeaderPolicy->isDecayingDict()) {
112        // Both probabilities are encoded. Decode them and get probability.
113        return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability);
114    } else {
115        if (unigramProbability == NOT_A_PROBABILITY) {
116            return NOT_A_PROBABILITY;
117        } else if (bigramProbability == NOT_A_PROBABILITY) {
118            return ProbabilityUtils::backoff(unigramProbability);
119        } else {
120            return bigramProbability;
121        }
122    }
123}
124
125int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos,
126        const int ptNodePos) const {
127    if (ptNodePos == NOT_A_DICT_POS) {
128        return NOT_A_PROBABILITY;
129    }
130    const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
131    if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
132        return NOT_A_PROBABILITY;
133    }
134    if (prevWordsPtNodePos) {
135        const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
136        BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
137        while (bigramsIt.hasNext()) {
138            bigramsIt.next();
139            if (bigramsIt.getBigramPos() == ptNodePos
140                    && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
141                return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
142            }
143        }
144        return NOT_A_PROBABILITY;
145    }
146    return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
147}
148
149void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos,
150        NgramListener *const listener) const {
151    if (!prevWordsPtNodePos) {
152        return;
153    }
154    const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
155    BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
156    while (bigramsIt.hasNext()) {
157        bigramsIt.next();
158        listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos());
159    }
160}
161
162int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
163    if (ptNodePos == NOT_A_DICT_POS) {
164        return NOT_A_DICT_POS;
165    }
166    const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
167    if (ptNodeParams.isDeleted()) {
168        return NOT_A_DICT_POS;
169    }
170    return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
171            ptNodeParams.getTerminalId());
172}
173
174int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
175    if (ptNodePos == NOT_A_DICT_POS) {
176        return NOT_A_DICT_POS;
177    }
178    const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
179    if (ptNodeParams.isDeleted()) {
180        return NOT_A_DICT_POS;
181    }
182    return mBuffers->getBigramDictContent()->getBigramListHeadPos(
183            ptNodeParams.getTerminalId());
184}
185
186bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length,
187        const UnigramProperty *const unigramProperty) {
188    if (!mBuffers->isUpdatable()) {
189        AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
190        return false;
191    }
192    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
193        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
194                mDictBuffer->getTailPosition());
195        return false;
196    }
197    if (length > MAX_WORD_LENGTH) {
198        AKLOGE("The word is too long to insert to the dictionary, length: %d", length);
199        return false;
200    }
201    for (const auto &shortcut : unigramProperty->getShortcuts()) {
202        if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
203            AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d",
204                    shortcut.getTargetCodePoints()->size());
205            return false;
206        }
207    }
208    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
209    readingHelper.initWithPtNodeArrayPos(getRootPosition());
210    bool addedNewUnigram = false;
211    int codePointsToAdd[MAX_WORD_LENGTH];
212    int codePointCountToAdd = length;
213    memmove(codePointsToAdd, word, sizeof(int) * length);
214    if (unigramProperty->representsBeginningOfSentence()) {
215        codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
216                codePointCountToAdd, MAX_WORD_LENGTH);
217    }
218    if (codePointCountToAdd <= 0) {
219        return false;
220    }
221    if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd,
222            unigramProperty, &addedNewUnigram)) {
223        if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
224            mUnigramCount++;
225        }
226        if (unigramProperty->getShortcuts().size() > 0) {
227            // Add shortcut target.
228            const int wordPos = getTerminalPtNodePositionOfWord(word, length,
229                    false /* forceLowerCaseSearch */);
230            if (wordPos == NOT_A_DICT_POS) {
231                AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
232                return false;
233            }
234            for (const auto &shortcut : unigramProperty->getShortcuts()) {
235                if (!mUpdatingHelper.addShortcutTarget(wordPos,
236                        shortcut.getTargetCodePoints()->data(),
237                        shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) {
238                    AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, "
239                            "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
240                            shortcut.getProbability());
241                    return false;
242                }
243            }
244        }
245        return true;
246    } else {
247        return false;
248    }
249}
250
251bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) {
252    if (!mBuffers->isUpdatable()) {
253        AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
254        return false;
255    }
256    const int ptNodePos = getTerminalPtNodePositionOfWord(word, length,
257            false /* forceLowerCaseSearch */);
258    if (ptNodePos == NOT_A_DICT_POS) {
259        return false;
260    }
261    const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
262    if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) {
263        AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos);
264        return false;
265    }
266    if (!ptNodeParams.representsNonWordInfo()) {
267        mUnigramCount--;
268    }
269    return true;
270}
271
272bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
273        const BigramProperty *const bigramProperty) {
274    if (!mBuffers->isUpdatable()) {
275        AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
276        return false;
277    }
278    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
279        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
280                mDictBuffer->getTailPosition());
281        return false;
282    }
283    if (!prevWordsInfo->isValid()) {
284        AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
285        return false;
286    }
287    if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
288        AKLOGE("The word is too long to insert the ngram to the dictionary. "
289                "length: %d", bigramProperty->getTargetCodePoints()->size());
290        return false;
291    }
292    int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
293    prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
294            false /* tryLowerCaseSearch */);
295    const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos);
296    // TODO: Support N-gram.
297    if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
298        if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) {
299            const std::vector<UnigramProperty::ShortcutProperty> shortcuts;
300            const UnigramProperty beginningOfSentenceUnigramProperty(
301                    true /* representsBeginningOfSentence */, true /* isNotAWord */,
302                    false /* isBlacklisted */, MAX_PROBABILITY /* probability */,
303                    NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
304            if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */),
305                    prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */),
306                    &beginningOfSentenceUnigramProperty)) {
307                AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
308                return false;
309            }
310            // Refresh Terminal PtNode positions.
311            prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
312                    false /* tryLowerCaseSearch */);
313        } else {
314            return false;
315        }
316    }
317    const int word1Pos = getTerminalPtNodePositionOfWord(
318            bigramProperty->getTargetCodePoints()->data(),
319            bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */);
320    if (word1Pos == NOT_A_DICT_POS) {
321        return false;
322    }
323    bool addedNewEntry = false;
324    if (mUpdatingHelper.addNgramEntry(prevWordsPtNodePosView, word1Pos, bigramProperty,
325            &addedNewEntry)) {
326        if (addedNewEntry) {
327            mBigramCount++;
328        }
329        return true;
330    } else {
331        return false;
332    }
333}
334
335bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
336        const int *const word, const int length) {
337    if (!mBuffers->isUpdatable()) {
338        AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
339        return false;
340    }
341    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
342        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
343                mDictBuffer->getTailPosition());
344        return false;
345    }
346    if (!prevWordsInfo->isValid()) {
347        AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary.");
348        return false;
349    }
350    if (length > MAX_WORD_LENGTH) {
351        AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length);
352    }
353    int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
354    prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
355            false /* tryLowerCaseSerch */);
356    const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos);
357    // TODO: Support N-gram.
358    if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
359        return false;
360    }
361    const int wordPos = getTerminalPtNodePositionOfWord(word, length,
362            false /* forceLowerCaseSearch */);
363    if (wordPos == NOT_A_DICT_POS) {
364        return false;
365    }
366    if (mUpdatingHelper.removeNgramEntry(prevWordsPtNodePosView, wordPos)) {
367        mBigramCount--;
368        return true;
369    } else {
370        return false;
371    }
372}
373
374bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
375    if (!mBuffers->isUpdatable()) {
376        AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
377        return false;
378    }
379    if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) {
380        AKLOGE("Cannot flush the dictionary to file.");
381        mIsCorrupted = true;
382        return false;
383    }
384    return true;
385}
386
387bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
388    if (!mBuffers->isUpdatable()) {
389        AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
390        return false;
391    }
392    if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
393        AKLOGE("Cannot flush the dictionary to file with GC.");
394        mIsCorrupted = true;
395        return false;
396    }
397    return true;
398}
399
400bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
401    if (!mBuffers->isUpdatable()) {
402        AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
403        return false;
404    }
405    if (mBuffers->isNearSizeLimit()) {
406        // Additional buffer size is near the limit.
407        return true;
408    } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
409            > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
410        // Total extended region size of the trie exceeds the limit.
411        return true;
412    } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
413            && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
414        // Needs to reduce dictionary size.
415        return true;
416    } else if (mHeaderPolicy->isDecayingDict()) {
417        return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount,
418                mHeaderPolicy);
419    }
420    return false;
421}
422
423void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
424        char *const outResult, const int maxResultLength) {
425    const int compareLength = queryLength + 1 /* terminator */;
426    if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
427        snprintf(outResult, maxResultLength, "%d", mUnigramCount);
428    } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
429        snprintf(outResult, maxResultLength, "%d", mBigramCount);
430    } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
431        snprintf(outResult, maxResultLength, "%d",
432                mHeaderPolicy->isDecayingDict() ?
433                        ForgettingCurveUtils::getUnigramCountHardLimit(
434                                mHeaderPolicy->getMaxUnigramCount()) :
435                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
436    } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
437        snprintf(outResult, maxResultLength, "%d",
438                mHeaderPolicy->isDecayingDict() ?
439                        ForgettingCurveUtils::getBigramCountHardLimit(
440                                mHeaderPolicy->getMaxBigramCount()) :
441                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
442    }
443}
444
445const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints,
446        const int codePointCount) const {
447    const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount,
448            false /* forceLowerCaseSearch */);
449    if (ptNodePos == NOT_A_DICT_POS) {
450        AKLOGE("getWordProperty is called for invalid word.");
451        return WordProperty();
452    }
453    const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
454    std::vector<int> codePointVector(ptNodeParams.getCodePoints(),
455            ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());
456    const ProbabilityEntry probabilityEntry =
457            mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
458                    ptNodeParams.getTerminalId());
459    const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
460    // Fetch bigram information.
461    std::vector<BigramProperty> bigrams;
462    const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
463    if (bigramListPos != NOT_A_DICT_POS) {
464        int bigramWord1CodePoints[MAX_WORD_LENGTH];
465        const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent();
466        const TerminalPositionLookupTable *const terminalPositionLookupTable =
467                mBuffers->getTerminalPositionLookupTable();
468        bool hasNext = true;
469        int readingPos = bigramListPos;
470        while (hasNext) {
471            const BigramEntry bigramEntry =
472                    bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
473            hasNext = bigramEntry.hasNext();
474            const int word1TerminalId = bigramEntry.getTargetTerminalId();
475            const int word1TerminalPtNodePos =
476                    terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId);
477            if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
478                continue;
479            }
480            // Word (unigram) probability
481            int word1Probability = NOT_A_PROBABILITY;
482            const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
483                    word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints,
484                    &word1Probability);
485            const std::vector<int> word1(bigramWord1CodePoints,
486                    bigramWord1CodePoints + codePointCount);
487            const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
488            const int probability = bigramEntry.hasHistoricalInfo() ?
489                    ForgettingCurveUtils::decodeProbability(
490                            bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
491                    bigramEntry.getProbability();
492            bigrams.emplace_back(&word1, probability,
493                    historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
494                    historicalInfo->getCount());
495        }
496    }
497    // Fetch shortcut information.
498    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
499    int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
500    if (shortcutPos != NOT_A_DICT_POS) {
501        int shortcutTarget[MAX_WORD_LENGTH];
502        const ShortcutDictContent *const shortcutDictContent =
503                mBuffers->getShortcutDictContent();
504        bool hasNext = true;
505        while (hasNext) {
506            int shortcutTargetLength = 0;
507            int shortcutProbability = NOT_A_PROBABILITY;
508            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
509                    &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
510            const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength);
511            shortcuts.emplace_back(&target, shortcutProbability);
512        }
513    }
514    const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
515            ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
516            historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
517            historicalInfo->getCount(), &shortcuts);
518    return WordProperty(&codePointVector, &unigramProperty, &bigrams);
519}
520
521int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
522        int *const outCodePointCount) {
523    *outCodePointCount = 0;
524    if (token == 0) {
525        mTerminalPtNodePositionsForIteratingWords.clear();
526        DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
527                &mTerminalPtNodePositionsForIteratingWords);
528        DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
529        readingHelper.initWithPtNodeArrayPos(getRootPosition());
530        readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
531    }
532    const int terminalPtNodePositionsVectorSize =
533            static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
534    if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
535        AKLOGE("Given token %d is invalid.", token);
536        return 0;
537    }
538    const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
539    int unigramProbability = NOT_A_PROBABILITY;
540    *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
541            terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
542    const int nextToken = token + 1;
543    if (nextToken >= terminalPtNodePositionsVectorSize) {
544        // All words have been iterated.
545        mTerminalPtNodePositionsForIteratingWords.clear();
546        return 0;
547    }
548    return nextToken;
549}
550
551} // namespace latinime
552