1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "dictionary/structure/v2/patricia_trie_policy.h"
18
19#include "defines.h"
20#include "suggest/core/dicnode/dic_node.h"
21#include "suggest/core/dicnode/dic_node_vector.h"
22#include "dictionary/interface/ngram_listener.h"
23#include "dictionary/property/ngram_context.h"
24#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
25#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
26#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
27#include "dictionary/utils/multi_bigram_map.h"
28#include "dictionary/utils/probability_utils.h"
29#include "utils/char_utils.h"
30
31namespace latinime {
32
33void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
34        DicNodeVector *const childDicNodes) const {
35    if (!dicNode->hasChildren()) {
36        return;
37    }
38    int nextPos = dicNode->getChildrenPtNodeArrayPos();
39    if (!isValidPos(nextPos)) {
40        AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd",
41                nextPos, mBuffer.size());
42        mIsCorrupted = true;
43        ASSERT(false);
44        return;
45    }
46    const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
47            mBuffer.data(), &nextPos);
48    for (int i = 0; i < childCount; i++) {
49        if (!isValidPos(nextPos)) {
50            AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d",
51                    nextPos, mBuffer.size(), i, childCount);
52            mIsCorrupted = true;
53            ASSERT(false);
54            return;
55        }
56        nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
57    }
58}
59
60int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
61        const int maxCodePointCount, int *const outCodePoints) const {
62    return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount,
63            outCodePoints, nullptr /* outUnigramProbability */);
64}
65// This retrieves code points and the probability of the word by its id.
66// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
67// it is possible to check for this with advantageous complexity. For each PtNode array, we search
68// for PtNodes with children and compare the children position with the position we look for.
69// When we shoot the position we look for, it means the word we look for is in the children
70// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a
71// PtNode array with the last PtNode's children position still less than what we are searching for,
72// we must descend the last PtNode's children (for example, if the word we are searching for starts
73// with a z, it's the last PtNode of the root array, so all children addresses will be smaller
74// than the position we look for, and we have to descend the z PtNode).
75/* Parameters :
76 * wordId: Id of the word we are searching for.
77 * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
78 * outUnigramProbability: a pointer to an int to write the probability into.
79 * Return value : the code point count, of 0 if the word was not found.
80 */
81// TODO: Split this function to be more readable
82int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
83        const int wordId, const int maxCodePointCount, int *const outCodePoints,
84        int *const outUnigramProbability) const {
85    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
86    int pos = getRootPosition();
87    int wordPos = 0;
88    const int *const codePointTable = mHeaderPolicy.getCodePointTable();
89    if (outUnigramProbability) {
90        *outUnigramProbability = NOT_A_PROBABILITY;
91    }
92    // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
93    // only traverse PtNodes that are actually a part of the terminal we are searching, so each
94    // time we enter this loop we are one depth level further than last time.
95    // The only reason we count PtNodes is because we want to reduce the probability of infinite
96    // looping in case there is a bug. Since we know there is an upper bound to the depth we are
97    // supposed to traverse, it does not hurt to count iterations.
98    for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
99        int lastCandidatePtNodePos = 0;
100        // Let's loop through PtNodes in this PtNode array searching for either the terminal
101        // or one of its ascendants.
102        if (!isValidPos(pos)) {
103            AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
104                    pos, mBuffer.size());
105            mIsCorrupted = true;
106            ASSERT(false);
107            return 0;
108        }
109        for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
110                mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
111            const int startPos = pos;
112            if (!isValidPos(pos)) {
113                AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
114                mIsCorrupted = true;
115                ASSERT(false);
116                return 0;
117            }
118            const PatriciaTrieReadingUtils::NodeFlags flags =
119                    PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
120            const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
121                    mBuffer.data(), codePointTable, &pos);
122            if (ptNodePos == startPos) {
123                // We found the position. Copy the rest of the code points in the buffer and return
124                // the length.
125                outCodePoints[wordPos] = character;
126                if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
127                    int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
128                            mBuffer.data(), codePointTable, &pos);
129                    // We count code points in order to avoid infinite loops if the file is broken
130                    // or if there is some other bug
131                    int charCount = maxCodePointCount;
132                    while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
133                        outCodePoints[++wordPos] = nextChar;
134                        nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
135                                mBuffer.data(), codePointTable, &pos);
136                    }
137                }
138                if (outUnigramProbability) {
139                    *outUnigramProbability =
140                            PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
141                                    mBuffer.data(), &pos);
142                }
143                return ++wordPos;
144            }
145            // We need to skip past this PtNode, so skip any remaining code points after the
146            // first and possibly the probability.
147            if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
148                PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
149                        codePointTable, &pos);
150            }
151            if (PatriciaTrieReadingUtils::isTerminal(flags)) {
152                PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
153            }
154            // The fact that this PtNode has children is very important. Since we already know
155            // that this PtNode does not match, if it has no children we know it is irrelevant
156            // to what we are searching for.
157            const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
158            // We will write in `found' whether we have passed the children position we are
159            // searching for. For example if we search for "beer", the children of b are less
160            // than the address we are searching for and the children of c are greater. When we
161            // come here for c, we realize this is too big, and that we should descend b.
162            bool found;
163            if (hasChildren) {
164                int currentPos = pos;
165                // Here comes the tricky part. First, read the children position.
166                const int childrenPos = PatriciaTrieReadingUtils
167                        ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
168                                &currentPos);
169                if (childrenPos > ptNodePos) {
170                    // If the children pos is greater than the position, it means the previous
171                    // PtNode, which position is stored in lastCandidatePtNodePos, was the right
172                    // one.
173                    found = true;
174                } else if (1 >= ptNodeCount) {
175                    // However if we are on the LAST PtNode of this array, and we have NOT shot the
176                    // position we should descend THIS PtNode. So we trick the
177                    // lastCandidatePtNodePos so that we will descend this PtNode, not the previous
178                    // one.
179                    lastCandidatePtNodePos = startPos;
180                    found = true;
181                } else {
182                    // Else, we should continue looking.
183                    found = false;
184                }
185            } else {
186                // Even if we don't have children here, we could still be on the last PtNode of
187                // this array. If this is the case, we should descend the last PtNode that had
188                // children, and their position is already in lastCandidatePtNodePos.
189                found = (1 >= ptNodeCount);
190            }
191
192            if (found) {
193                // Okay, we found the PtNode we should descend. Its position is in
194                // the lastCandidatePtNodePos variable, so we just re-read it.
195                if (0 != lastCandidatePtNodePos) {
196                    const PatriciaTrieReadingUtils::NodeFlags lastFlags =
197                            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
198                                    mBuffer.data(), &lastCandidatePtNodePos);
199                    const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
200                            mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
201                    // We copy all the characters in this PtNode to the buffer
202                    outCodePoints[wordPos] = lastChar;
203                    if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
204                        int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
205                                mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
206                        int charCount = maxCodePointCount;
207                        while (-1 != nextChar && --charCount > 0) {
208                            outCodePoints[++wordPos] = nextChar;
209                            nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
210                                    mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
211                        }
212                    }
213                    ++wordPos;
214                    // Now we only need to branch to the children address. Skip the probability if
215                    // it's there, read pos, and break to resume the search at pos.
216                    if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
217                        PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
218                                &lastCandidatePtNodePos);
219                    }
220                    pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
221                            mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
222                    break;
223                } else {
224                    // Here is a little tricky part: we come here if we found out that all children
225                    // addresses in this PtNode are bigger than the address we are searching for.
226                    // Should we conclude the word is not in the dictionary? No! It could still be
227                    // one of the remaining PtNodes in this array, so we have to keep looking in
228                    // this array until we find it (or we realize it's not there either, in which
229                    // case it's actually not in the dictionary). Pass the end of this PtNode,
230                    // ready to start the next one.
231                    if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
232                        PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
233                                mBuffer.data(), flags, &pos);
234                    }
235                    if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
236                        mShortcutListPolicy.skipAllShortcuts(&pos);
237                    }
238                    if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
239                        if (!mBigramListPolicy.skipAllBigrams(&pos)) {
240                            AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
241                                    pos);
242                            mIsCorrupted = true;
243                            ASSERT(false);
244                            return 0;
245                        }
246                    }
247                }
248            } else {
249                // If we did not find it, we should record the last children address for the next
250                // iteration.
251                if (hasChildren) lastCandidatePtNodePos = startPos;
252                // Now skip the end of this PtNode (children pos and the attributes if any) so that
253                // our pos is after the end of this PtNode, at the start of the next one.
254                if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
255                    PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
256                            mBuffer.data(), flags, &pos);
257                }
258                if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
259                    mShortcutListPolicy.skipAllShortcuts(&pos);
260                }
261                if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
262                    if (!mBigramListPolicy.skipAllBigrams(&pos)) {
263                        AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
264                        mIsCorrupted = true;
265                        ASSERT(false);
266                        return 0;
267                    }
268                }
269            }
270
271        }
272    }
273    // If we have looked through all the PtNodes and found no match, the ptNodePos is
274    // not the position of a terminal in this dictionary.
275    return 0;
276}
277
278// This function gets the position of the terminal PtNode of the exact matching word in the
279// dictionary. If no match is found, it returns NOT_A_WORD_ID.
280int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
281        const bool forceLowerCaseSearch) const {
282    DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
283    readingHelper.initWithPtNodeArrayPos(getRootPosition());
284    const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
285            wordCodePoints.size(), forceLowerCaseSearch);
286    if (readingHelper.isError()) {
287        mIsCorrupted = true;
288        AKLOGE("Dictionary reading error in getWordId().");
289    }
290    return getWordIdFromTerminalPtNodePos(ptNodePos);
291}
292
293const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
294        const WordIdArrayView prevWordIds, const int wordId,
295        MultiBigramMap *const multiBigramMap) const {
296    if (wordId == NOT_A_WORD_ID) {
297        return WordAttributes();
298    }
299    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
300    const PtNodeParams ptNodeParams =
301            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
302    if (multiBigramMap) {
303        const int probability =  multiBigramMap->getBigramProbability(this /* structurePolicy */,
304                prevWordIds, wordId, ptNodeParams.getProbability());
305        return getWordAttributes(probability, ptNodeParams);
306    }
307    if (!prevWordIds.empty()) {
308        const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId);
309        if (bigramProbability != NOT_A_PROBABILITY) {
310            return getWordAttributes(bigramProbability, ptNodeParams);
311        }
312    }
313    return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
314            ptNodeParams);
315}
316
317const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
318        const PtNodeParams &ptNodeParams) const {
319    return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
320            ptNodeParams.isPossiblyOffensive());
321}
322
323int PatriciaTriePolicy::getProbability(const int unigramProbability,
324        const int bigramProbability) const {
325    // Due to space constraints, the probability for bigrams is approximate - the lower the unigram
326    // probability, the worse the precision. The theoritical maximum error in resulting probability
327    // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means
328    // that sometimes, we'll see some bigrams interverted here, but it can't get too bad.
329    if (unigramProbability == NOT_A_PROBABILITY) {
330        return NOT_A_PROBABILITY;
331    } else if (bigramProbability == NOT_A_PROBABILITY) {
332        return ProbabilityUtils::backoff(unigramProbability);
333    } else {
334        return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
335                bigramProbability);
336    }
337}
338
339int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
340        const int wordId) const {
341    if (wordId == NOT_A_WORD_ID) {
342        return NOT_A_PROBABILITY;
343    }
344    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
345    const PtNodeParams ptNodeParams =
346            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
347    if (ptNodeParams.isNotAWord()) {
348        // If this is not a word, it should behave as having no probability outside of the
349        // suggestion process (where it should be used for shortcuts).
350        return NOT_A_PROBABILITY;
351    }
352    if (!prevWordIds.empty()) {
353        const int bigramsPosition = getBigramsPositionOfPtNode(
354                getTerminalPtNodePosFromWordId(prevWordIds[0]));
355        BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
356        while (bigramsIt.hasNext()) {
357            bigramsIt.next();
358            if (bigramsIt.getBigramPos() == ptNodePos
359                    && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
360                return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
361            }
362        }
363        return NOT_A_PROBABILITY;
364    }
365    return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
366}
367
368void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
369        NgramListener *const listener) const {
370    if (prevWordIds.empty()) {
371        return;
372    }
373    const int bigramsPosition = getBigramsPositionOfPtNode(
374            getTerminalPtNodePosFromWordId(prevWordIds[0]));
375    BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
376    while (bigramsIt.hasNext()) {
377        bigramsIt.next();
378        listener->onVisitEntry(bigramsIt.getProbability(),
379                getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
380    }
381}
382
383BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const {
384    const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
385    return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos);
386}
387
388int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
389    if (ptNodePos == NOT_A_DICT_POS) {
390        return NOT_A_DICT_POS;
391    }
392    return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
393}
394
395int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
396    if (ptNodePos == NOT_A_DICT_POS) {
397        return NOT_A_DICT_POS;
398    }
399    return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos();
400}
401
402int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
403        const int ptNodePos, DicNodeVector *childDicNodes) const {
404    PatriciaTrieReadingUtils::NodeFlags flags;
405    int mergedNodeCodePointCount = 0;
406    int mergedNodeCodePoints[MAX_WORD_LENGTH];
407    int probability = NOT_A_PROBABILITY;
408    int childrenPos = NOT_A_DICT_POS;
409    int shortcutPos = NOT_A_DICT_POS;
410    int bigramPos = NOT_A_DICT_POS;
411    int siblingPos = NOT_A_DICT_POS;
412    const int *const codePointTable = mHeaderPolicy.getCodePointTable();
413    PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
414            &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
415            mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
416            &siblingPos);
417    // Skip PtNodes don't start with Unicode code point because they represent non-word information.
418    if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
419        const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
420        childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId,
421                CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount));
422    }
423    return siblingPos;
424}
425
426const WordProperty PatriciaTriePolicy::getWordProperty(
427        const CodePointArrayView wordCodePoints) const {
428    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
429    if (wordId == NOT_A_WORD_ID) {
430        AKLOGE("getWordProperty was called for invalid word.");
431        return WordProperty();
432    }
433    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
434    const PtNodeParams ptNodeParams =
435            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
436    // Fetch bigram information.
437    std::vector<NgramProperty> ngrams;
438    const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
439    int bigramWord1CodePoints[MAX_WORD_LENGTH];
440    BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
441    while (bigramsIt.hasNext()) {
442        // Fetch the next bigram information and forward the iterator.
443        bigramsIt.next();
444        // Skip the entry if the entry has been deleted. This never happens for ver2 dicts.
445        if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
446            int word1Probability = NOT_A_PROBABILITY;
447            const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
448                    getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
449                    bigramWord1CodePoints, &word1Probability);
450            const int probability = getProbability(word1Probability, bigramsIt.getProbability());
451            ngrams.emplace_back(
452                    NgramContext(wordCodePoints.data(), wordCodePoints.size(),
453                            ptNodeParams.representsBeginningOfSentence()),
454                    CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
455                    probability, HistoricalInfo());
456        }
457    }
458    // Fetch shortcut information.
459    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
460    int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
461    if (shortcutPos != NOT_A_DICT_POS) {
462        int shortcutTargetCodePoints[MAX_WORD_LENGTH];
463        ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos);
464        bool hasNext = true;
465        while (hasNext) {
466            const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
467                    ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos);
468            hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
469            const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
470                    mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
471            const int shortcutProbability =
472                    ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
473            shortcuts.emplace_back(
474                    CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(),
475                    shortcutProbability);
476        }
477    }
478    const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
479            ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
480            ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
481    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
482}
483
484int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
485        int *const outCodePointCount) {
486    *outCodePointCount = 0;
487    if (token == 0) {
488        // Start iterating the dictionary.
489        mTerminalPtNodePositionsForIteratingWords.clear();
490        DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
491                &mTerminalPtNodePositionsForIteratingWords);
492        DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
493        readingHelper.initWithPtNodeArrayPos(getRootPosition());
494        readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
495    }
496    const int terminalPtNodePositionsVectorSize =
497            static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
498    if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
499        AKLOGE("Given token %d is invalid.", token);
500        return 0;
501    }
502    const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
503    *outCodePointCount = getCodePointsAndReturnCodePointCount(
504            getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
505    const int nextToken = token + 1;
506    if (nextToken >= terminalPtNodePositionsVectorSize) {
507        // All words have been iterated.
508        mTerminalPtNodePositionsForIteratingWords.clear();
509        return 0;
510    }
511    return nextToken;
512}
513
514int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
515    return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
516}
517
518int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
519    return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
520}
521
522bool PatriciaTriePolicy::isValidPos(const int pos) const {
523    return pos >= 0 && pos < static_cast<int>(mBuffer.size());
524}
525
526} // namespace latinime
527