1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * !!!!! DO NOT EDIT THIS FILE !!!!!
19 *
20 * This file was generated from
21 *   suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
22 */
23
24#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
25
26#include <cstring>
27#include <queue>
28
29#include "suggest/policyimpl/dictionary/header/header_policy.h"
30#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
31#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
32#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
33#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
34#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
35#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
36#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
37#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
38#include "suggest/policyimpl/dictionary/utils/file_utils.h"
39#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
40
41namespace latinime {
42namespace backward {
43namespace v402 {
44
45bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
46        const int unigramCount, const int bigramCount) const {
47    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
48    BufferWithExtendableBuffer headerBuffer(
49            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
50    const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
51            + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
52    if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
53            unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) {
54        AKLOGE("Cannot write header structure to buffer. "
55                "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
56                "extendedRegionSize: %d", false, unigramCount, bigramCount,
57                extendedRegionSize);
58        return false;
59    }
60    return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
61}
62
63bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
64        const char *const dictDirPath) {
65    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
66    Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
67            Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
68                    Ver4DictConstants::MAX_DICTIONARY_SIZE));
69    int unigramCount = 0;
70    int bigramCount = 0;
71    if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
72        return false;
73    }
74    BufferWithExtendableBuffer headerBuffer(
75            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
76    if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
77            unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) {
78        return false;
79    }
80    return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
81}
82
83bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
84        const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
85        int *const outUnigramCount, int *const outBigramCount) {
86    Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
87            mBuffers->getProbabilityDictContent(), headerPolicy);
88    Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
89    Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
90            mBuffers->getTerminalPositionLookupTable(), headerPolicy);
91    Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
92            mBuffers->getTerminalPositionLookupTable());
93    Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
94            mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
95            &shortcutPolicy);
96
97    DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
98    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
99    DynamicPtGcEventListeners
100            ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
101                    traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
102                            &ptNodeWriter);
103    if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
104            &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
105        return false;
106    }
107    const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
108            .getValidUnigramCount();
109    const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
110    if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
111        if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
112            AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
113                    maxUnigramCount);
114            return false;
115        }
116    }
117
118    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
119    DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
120            traversePolicyToUpdateBigramProbability(&ptNodeWriter);
121    if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
122            &traversePolicyToUpdateBigramProbability)) {
123        return false;
124    }
125    const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
126    const int maxBigramCount = headerPolicy->getMaxBigramCount();
127    if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
128        if (!truncateBigrams(maxBigramCount)) {
129            AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
130            return false;
131        }
132    }
133
134    // Mapping from positions in mBuffer to positions in bufferToWrite.
135    PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
136    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
137    Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
138            buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
139            &shortcutPolicy);
140    DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
141            traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
142                    buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
143    if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
144            &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
145        return false;
146    }
147
148    // Create policy instances for the GCed dictionary.
149    Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
150            buffersToWrite->getProbabilityDictContent(), headerPolicy);
151    Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
152    Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
153            buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
154    Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
155            buffersToWrite->getTerminalPositionLookupTable());
156    Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
157            buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
158            &newShortcutPolicy);
159    // Re-assign terminal IDs for valid terminal PtNodes.
160    TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
161    if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
162            &terminalIdMap)) {
163        return false;
164    }
165    // Run GC for probability dict content.
166    if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
167            mBuffers->getProbabilityDictContent())) {
168        return false;
169    }
170    // Run GC for bigram dict content.
171    if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
172            mBuffers->getBigramDictContent(), outBigramCount)) {
173        return false;
174    }
175    // Run GC for shortcut dict content.
176    if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
177            mBuffers->getShortcutDictContent())) {
178        return false;
179    }
180    DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
181    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
182    DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
183            traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
184    if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
185            &traversePolicyToUpdateAllPositionFields)) {
186        return false;
187    }
188    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
189    TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
190            traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
191    if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
192            &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
193        return false;
194    }
195    *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
196    return true;
197}
198
199bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
200        const Ver4PatriciaTrieNodeReader *const ptNodeReader,
201        Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
202    const TerminalPositionLookupTable *const terminalPosLookupTable =
203            mBuffers->getTerminalPositionLookupTable();
204    const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
205    std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
206            priorityQueue;
207    for (int i = 0; i < nextTerminalId; ++i) {
208        const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
209        if (terminalPos == NOT_A_DICT_POS) {
210            continue;
211        }
212        const ProbabilityEntry probabilityEntry =
213                mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
214        const int probability = probabilityEntry.hasHistoricalInfo() ?
215                ForgettingCurveUtils::decodeProbability(
216                        probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
217                probabilityEntry.getProbability();
218        priorityQueue.push(DictProbability(terminalPos, probability,
219                probabilityEntry.getHistoricalInfo()->getTimeStamp()));
220    }
221
222    // Delete unigrams.
223    while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
224        const int ptNodePos = priorityQueue.top().getDictPos();
225        priorityQueue.pop();
226        const PtNodeParams ptNodeParams =
227                ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
228        if (ptNodeParams.representsNonWordInfo()) {
229            continue;
230        }
231        if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
232            AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
233            return false;
234        }
235    }
236    return true;
237}
238
239bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
240    const TerminalPositionLookupTable *const terminalPosLookupTable =
241            mBuffers->getTerminalPositionLookupTable();
242    const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
243    std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
244            priorityQueue;
245    BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
246    for (int i = 0; i < nextTerminalId; ++i) {
247        const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
248        if (bigramListPos == NOT_A_DICT_POS) {
249            continue;
250        }
251        bool hasNext = true;
252        int readingPos = bigramListPos;
253        while (hasNext) {
254            const int entryPos = readingPos;
255            const BigramEntry bigramEntry =
256                    bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
257            hasNext = bigramEntry.hasNext();
258            if (!bigramEntry.isValid()) {
259                continue;
260            }
261            const int probability = bigramEntry.hasHistoricalInfo() ?
262                    ForgettingCurveUtils::decodeProbability(
263                            bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
264                    bigramEntry.getProbability();
265            priorityQueue.push(DictProbability(entryPos, probability,
266                    bigramEntry.getHistoricalInfo()->getTimeStamp()));
267        }
268    }
269
270    // Delete bigrams.
271    while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
272        const int entryPos = priorityQueue.top().getDictPos();
273        const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
274        const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
275        if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
276            AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
277            return false;
278        }
279        priorityQueue.pop();
280    }
281    return true;
282}
283
284bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
285        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
286    if (!ptNodeParams->isTerminal()) {
287        return true;
288    }
289    TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
290            mTerminalIdMap->find(ptNodeParams->getTerminalId());
291    if (it == mTerminalIdMap->end()) {
292        AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
293                ptNodeParams->getTerminalId(), mTerminalIdMap->size());
294        return false;
295    }
296    if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
297        AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
298    }
299    return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
300}
301
302} // namespace v402
303} // namespace backward
304} // namespace latinime
305