ver4_patricia_trie_node_writer.cpp revision 851e0458fe460526b1f953e39a1e406a21ab4647
1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
18
19#include "suggest/core/dictionary/property/unigram_property.h"
20#include "suggest/policyimpl/dictionary/header/header_policy.h"
21#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
22#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
23#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
24#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
25#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
26#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
27#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
28#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
29#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
30#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
31
32namespace latinime {
33
34const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3;
35
36bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
37        const PtNodeParams *const toBeUpdatedPtNodeParams) {
38    int pos = toBeUpdatedPtNodeParams->getHeadPos();
39    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
40    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
41    if (usesAdditionalBuffer) {
42        pos -= mTrieBuffer->getOriginalBufferSize();
43    }
44    // Read original flags
45    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
46            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
47    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
48            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
49                    true /* isDeleted */, false /* willBecomeNonTerminal */);
50    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
51    // Update flags.
52    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
53            &writingPos)) {
54        return false;
55    }
56    if (toBeUpdatedPtNodeParams->isTerminal()) {
57        // The PtNode is a terminal. Delete entry from the terminal position lookup table.
58        return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
59                toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
60    } else {
61        return true;
62    }
63}
64
65bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
66        const PtNodeParams *const toBeUpdatedPtNodeParams,
67        const int movedPos, const int bigramLinkedNodePos) {
68    int pos = toBeUpdatedPtNodeParams->getHeadPos();
69    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
70    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
71    if (usesAdditionalBuffer) {
72        pos -= mTrieBuffer->getOriginalBufferSize();
73    }
74    // Read original flags
75    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
76            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
77    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
78            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */,
79                    false /* isDeleted */, false /* willBecomeNonTerminal */);
80    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
81    // Update flags.
82    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
83            &writingPos)) {
84        return false;
85    }
86    // Update moved position, which is stored in the parent offset field.
87    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
88            mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
89        return false;
90    }
91    if (toBeUpdatedPtNodeParams->hasChildren()) {
92        // Update children's parent position.
93        mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos());
94        while (!mReadingHelper.isEnd()) {
95            const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams());
96            int parentOffsetFieldPos = childPtNodeParams.getHeadPos()
97                    + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
98            if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
99                    mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(),
100                    &parentOffsetFieldPos)) {
101                // Parent offset cannot be written because of a bug or a broken dictionary; thus,
102                // we give up to update dictionary.
103                return false;
104            }
105            mReadingHelper.readNextSiblingNode(childPtNodeParams);
106        }
107    }
108    return true;
109}
110
111bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
112        const PtNodeParams *const toBeUpdatedPtNodeParams) {
113    int pos = toBeUpdatedPtNodeParams->getHeadPos();
114    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
115    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
116    if (usesAdditionalBuffer) {
117        pos -= mTrieBuffer->getOriginalBufferSize();
118    }
119    // Read original flags
120    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
121            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
122    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
123            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
124                    false /* isDeleted */, true /* willBecomeNonTerminal */);
125    if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
126            toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
127        AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
128                toBeUpdatedPtNodeParams->getTerminalId());
129        return false;
130    }
131    // Update flags.
132    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
133    return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
134            &writingPos);
135}
136
137bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
138        const PtNodeParams *const toBeUpdatedPtNodeParams,
139        const UnigramProperty *const unigramProperty) {
140    // Update probability and historical information.
141    // TODO: Update other information in the unigram property.
142    if (!toBeUpdatedPtNodeParams->isTerminal()) {
143        return false;
144    }
145    const ProbabilityEntry originalProbabilityEntry =
146            mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
147                    toBeUpdatedPtNodeParams->getTerminalId());
148    const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry,
149            unigramProperty);
150    return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
151            toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
152}
153
154bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
155        const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
156    if (!toBeUpdatedPtNodeParams->isTerminal()) {
157        AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
158        return false;
159    }
160    const ProbabilityEntry originalProbabilityEntry =
161            mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
162                    toBeUpdatedPtNodeParams->getTerminalId());
163    if (originalProbabilityEntry.hasHistoricalInfo()) {
164        const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
165                originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy);
166        const ProbabilityEntry probabilityEntry =
167                originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
168        if (!mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
169                toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
170            AKLOGE("Cannot write updated probability entry. terminalId: %d",
171                    toBeUpdatedPtNodeParams->getTerminalId());
172            return false;
173        }
174        const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
175        if (!isValid) {
176            if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
177                AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
178                return false;
179            }
180        }
181        *outNeedsToKeepPtNode = isValid;
182    } else {
183        // No need to update probability.
184        *outNeedsToKeepPtNode = true;
185    }
186    return true;
187}
188
189bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
190        const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) {
191    int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos();
192    return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
193            newChildrenPosition, &childrenPosFieldPos);
194}
195
196bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
197        const int newTerminalId) {
198    return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
199            toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
200}
201
202bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
203        const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
204    return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */,
205            ptNodeWritingPos);
206}
207
208
209bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
210        const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
211        int *const ptNodeWritingPos) {
212    int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
213    if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
214            ptNodeWritingPos)) {
215        return false;
216    }
217    // Write probability.
218    ProbabilityEntry newProbabilityEntry;
219    const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom(
220            &newProbabilityEntry, unigramProperty);
221    return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
222            terminalId, &probabilityEntryToWrite);
223}
224
225bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry(
226        const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam,
227        const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) {
228    if (!mBigramPolicy->addNewEntry(sourcePtNodeParams->getTerminalId(),
229            targetPtNodeParam->getTerminalId(), bigramProperty, outAddedNewBigram)) {
230        AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d",
231                sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId());
232        return false;
233    }
234    return true;
235}
236
237bool Ver4PatriciaTrieNodeWriter::removeBigramEntry(
238        const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam) {
239    return mBigramPolicy->removeEntry(sourcePtNodeParams->getTerminalId(),
240            targetPtNodeParam->getTerminalId());
241}
242
243bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
244            const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
245    return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
246            sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
247}
248
249bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
250        const PtNodeParams *const toBeUpdatedPtNodeParams,
251        const DictPositionRelocationMap *const dictPositionRelocationMap,
252        int *const outBigramEntryCount) {
253    int parentPos = toBeUpdatedPtNodeParams->getParentPos();
254    if (parentPos != NOT_A_DICT_POS) {
255        PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
256                dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
257        if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
258            parentPos = it->second;
259        }
260    }
261    int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
262            + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
263    // Write updated parent offset.
264    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
265            parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
266        return false;
267    }
268
269    // Updates children position.
270    int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
271    if (childrenPos != NOT_A_DICT_POS) {
272        PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
273                dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
274        if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
275            childrenPos = it->second;
276        }
277    }
278    if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
279        return false;
280    }
281
282    // Counts bigram entries.
283    if (outBigramEntryCount) {
284        *outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
285                toBeUpdatedPtNodeParams->getTerminalId());
286    }
287    return true;
288}
289
290bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
291        const int *const targetCodePoints, const int targetCodePointCount,
292        const int shortcutProbability) {
293    if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
294            targetCodePoints, targetCodePointCount, shortcutProbability)) {
295        AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId());
296        return false;
297    }
298    return true;
299}
300
301bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
302        const PtNodeParams *const ptNodeParams, int *const outTerminalId,
303        int *const ptNodeWritingPos) {
304    const int nodePos = *ptNodeWritingPos;
305    // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the
306    // PtNode writing.
307    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer,
308            0 /* nodeFlags */, ptNodeWritingPos)) {
309        return false;
310    }
311    // Calculate a parent offset and write the offset.
312    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
313            ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) {
314        return false;
315    }
316    // Write code points
317    if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer,
318            ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) {
319        return false;
320    }
321    int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
322    if (!ptNodeParams->willBecomeNonTerminal()) {
323        if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) {
324            terminalId = ptNodeParams->getTerminalId();
325        } else if (ptNodeParams->isTerminal()) {
326            // Write terminal information using a new terminal id.
327            // Get a new unused terminal id.
328            terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId();
329        }
330    }
331    const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
332    if (isTerminal) {
333        // Update the lookup table.
334        if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
335                terminalId, nodePos)) {
336            return false;
337        }
338        // Write terminal Id.
339        if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId,
340                Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
341            return false;
342        }
343        if (outTerminalId) {
344            *outTerminalId = terminalId;
345        }
346    }
347    // Write children position
348    if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
349            ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
350        return false;
351    }
352    return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
353            isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
354}
355
356const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
357        const ProbabilityEntry *const originalProbabilityEntry,
358        const UnigramProperty *const unigramProperty) const {
359    // TODO: Consolidate historical info and probability.
360    if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
361        const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(),
362                unigramProperty->getLevel(), unigramProperty->getCount());
363        const HistoricalInfo updatedHistoricalInfo =
364                ForgettingCurveUtils::createUpdatedHistoricalInfo(
365                        originalProbabilityEntry->getHistoricalInfo(),
366                        unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy);
367        return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
368                &updatedHistoricalInfo);
369    } else {
370        return originalProbabilityEntry->createEntryWithUpdatedProbability(
371                unigramProperty->getProbability());
372    }
373}
374
375bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos,
376        const bool isBlacklisted, const bool isNotAWord, const bool isTerminal,
377        const bool hasMultipleChars) {
378    // Create node flags and write them.
379    PatriciaTrieReadingUtils::NodeFlags nodeFlags =
380            PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal,
381                    false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars,
382                    CHILDREN_POSITION_FIELD_SIZE);
383    if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
384        AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
385        return false;
386    }
387    return true;
388}
389
390} // namespace latinime
391