dynamic_pt_updating_helper.cpp revision 9069d30043d5182dfd38465ad9bbc11ad73fab7c
1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
18
19#include "suggest/core/dictionary/property/unigram_property.h"
20#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
21#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
22#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
23#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
24#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
25#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
26
27namespace latinime {
28
29const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
30
31bool DynamicPtUpdatingHelper::addUnigramWord(
32        DynamicPtReadingHelper *const readingHelper,
33        const int *const wordCodePoints, const int codePointCount,
34        const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
35    int parentPos = NOT_A_DICT_POS;
36    while (!readingHelper->isEnd()) {
37        const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
38        if (!ptNodeParams.isValid()) {
39            break;
40        }
41        const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
42        if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
43                wordCodePoints[matchedCodePointCount])) {
44            // The first code point is different from target code point. Skip this node and read
45            // the next sibling node.
46            readingHelper->readNextSiblingNode(ptNodeParams);
47            continue;
48        }
49        // Check following merged node code points.
50        const int nodeCodePointCount = ptNodeParams.getCodePointCount();
51        for (int j = 1; j < nodeCodePointCount; ++j) {
52            const int nextIndex = matchedCodePointCount + j;
53            if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
54                    wordCodePoints[matchedCodePointCount + j])) {
55                *outAddedNewUnigram = true;
56                return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
57                        wordCodePoints + matchedCodePointCount,
58                        codePointCount - matchedCodePointCount);
59            }
60        }
61        // All characters are matched.
62        if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) {
63            return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
64        }
65        if (!ptNodeParams.hasChildren()) {
66            *outAddedNewUnigram = true;
67            return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
68                    wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams),
69                    codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams));
70        }
71        // Advance to the children nodes.
72        parentPos = ptNodeParams.getHeadPos();
73        readingHelper->readChildNode(ptNodeParams);
74    }
75    if (readingHelper->isError()) {
76        // The dictionary is invalid.
77        return false;
78    }
79    int pos = readingHelper->getPosOfLastForwardLinkField();
80    *outAddedNewUnigram = true;
81    return createAndInsertNodeIntoPtNodeArray(parentPos,
82            wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
83            codePointCount - readingHelper->getPrevTotalCodePointCount(),
84            unigramProperty, &pos);
85}
86
87bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
88        const int wordPos, const BigramProperty *const bigramProperty,
89        bool *const outAddedNewEntry) {
90    if (prevWordsPtNodePos.empty()) {
91        return false;
92    }
93    ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
94    int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
95    for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
96        prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
97                prevWordsPtNodePos[i]).getTerminalId();
98    }
99    const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
100    const int wordId =
101            mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
102    return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, bigramProperty, outAddedNewEntry);
103}
104
105bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
106        const int wordPos) {
107    if (prevWordsPtNodePos.empty()) {
108        return false;
109    }
110    ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
111    int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
112    for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
113        prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
114                prevWordsPtNodePos[i]).getTerminalId();
115    }
116    const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
117    const int wordId =
118            mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
119    return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId);
120}
121
122bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
123        const int *const targetCodePoints, const int targetCodePointCount,
124        const int shortcutProbability) {
125    const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
126    return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount,
127            shortcutProbability);
128}
129
130bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
131        const int *const nodeCodePoints, const int nodeCodePointCount,
132        const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) {
133    const int newPtNodeArrayPos = mBuffer->getTailPosition();
134    if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
135            newPtNodeArrayPos, forwardLinkFieldPos)) {
136        return false;
137    }
138    return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount,
139            unigramProperty);
140}
141
142bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
143        const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
144    if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) {
145        // Overwrites the probability.
146        *outAddedNewUnigram = false;
147        return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
148    } else {
149        // Make the node terminal and write the probability.
150        *outAddedNewUnigram = true;
151        const int movedPos = mBuffer->getTailPosition();
152        int writingPos = movedPos;
153        const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
154                unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
155                true /* isTerminal */, originalPtNodeParams->getParentPos(),
156                originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(),
157                unigramProperty->getProbability()));
158        if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
159                unigramProperty, &writingPos)) {
160            return false;
161        }
162        if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
163            return false;
164        }
165    }
166    return true;
167}
168
169bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
170        const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
171        const int *const codePoints, const int codePointCount) {
172    const int newPtNodeArrayPos = mBuffer->getTailPosition();
173    if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
174        return false;
175    }
176    return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
177            codePointCount, unigramProperty);
178}
179
180bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
181        const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount,
182        const UnigramProperty *const unigramProperty) {
183    int writingPos = mBuffer->getTailPosition();
184    if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
185            1 /* arraySize */, &writingPos)) {
186        return false;
187    }
188    const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
189            unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */,
190            parentPtNodePos, nodeCodePointCount, nodeCodePoints,
191            unigramProperty->getProbability()));
192    if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
193            unigramProperty, &writingPos)) {
194        return false;
195    }
196    if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
197            NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
198        return false;
199    }
200    return true;
201}
202
203// Returns whether the dictionary updating was succeeded or not.
204bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
205        const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount,
206        const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints,
207        const int newNodeCodePointCount) {
208    // When addsExtraChild is true, split the reallocating PtNode and add new child.
209    // Reallocating PtNode: abcde, newNode: abcxy.
210    // abc (1st, not terminal) __ de (2nd)
211    //                         \_ xy (extra child, terminal)
212    // Otherwise, this method makes 1st part terminal and write information in unigramProperty.
213    // Reallocating PtNode: abcde, newNode: abc.
214    // abc (1st, terminal) __ de (2nd)
215    const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount;
216    const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
217    int writingPos = firstPartOfReallocatedPtNodePos;
218    // Write the 1st part of the reallocating node. The children position will be updated later
219    // with actual children position.
220    if (addsExtraChild) {
221        const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
222                false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */,
223                reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount,
224                reallocatingPtNodeParams->getCodePoints(), NOT_A_PROBABILITY));
225        if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
226            return false;
227        }
228    } else {
229        const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
230                unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
231                true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
232                overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(),
233                unigramProperty->getProbability()));
234        if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
235                unigramProperty, &writingPos)) {
236            return false;
237        }
238    }
239    const int actualChildrenPos = writingPos;
240    // Create new children PtNode array.
241    const size_t newPtNodeCount = addsExtraChild ? 2 : 1;
242    if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
243            newPtNodeCount, &writingPos)) {
244        return false;
245    }
246    // Write the 2nd part of the reallocating node.
247    const int secondPartOfReallocatedPtNodePos = writingPos;
248    const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
249            reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(),
250            reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
251            reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount,
252            reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount,
253            reallocatingPtNodeParams->getProbability()));
254    if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
255        return false;
256    }
257    if (addsExtraChild) {
258        const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
259                unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
260                true /* isTerminal */, firstPartOfReallocatedPtNodePos,
261                newNodeCodePointCount - overlappingCodePointCount,
262                newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability()));
263        if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
264                unigramProperty, &writingPos)) {
265            return false;
266        }
267    }
268    if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
269            NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
270        return false;
271    }
272    // Update original reallocating PtNode as moved.
273    if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos,
274            secondPartOfReallocatedPtNodePos)) {
275        return false;
276    }
277    // Load node info. Information of the 1st part will be fetched.
278    const PtNodeParams ptNodeParams(
279            mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos));
280    // Update children position.
281    return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos);
282}
283
284const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
285        const PtNodeParams *const originalPtNodeParams,
286        const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos,
287        const int codePointCount, const int *const codePoints, const int probability) const {
288    const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
289            isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
290            false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */,
291            CHILDREN_POSITION_FIELD_SIZE);
292    return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints,
293            probability);
294}
295
296const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(
297        const bool isNotAWord, const bool isBlacklisted, const bool isTerminal,
298        const int parentPos, const int codePointCount, const int *const codePoints,
299        const int probability) const {
300    const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
301            isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
302            false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */,
303            CHILDREN_POSITION_FIELD_SIZE);
304    return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability);
305}
306
307} // namespace latinime
308