16bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi/*
26bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * Copyright (C) 2013, The Android Open Source Project
36bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi *
46bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License");
56bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * you may not use this file except in compliance with the License.
66bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * You may obtain a copy of the License at
76bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi *
86bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi *     http://www.apache.org/licenses/LICENSE-2.0
96bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi *
106bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software
116bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS,
126bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
136bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * See the License for the specific language governing permissions and
146bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * limitations under the License.
156bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi */
166bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
176bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi/*
186bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * !!!!! DO NOT EDIT THIS FILE !!!!!
196bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi *
206bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi * This file was generated from
2188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi *   dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
226bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi */
236bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
2488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
256bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
266bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi#include <cstring>
276bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi#include <queue>
286bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
2988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/header/header_policy.h"
3088bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
3188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
3288bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
3388bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
3488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
3588bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
3688bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
3788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/buffer_with_extendable_buffer.h"
3888bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/file_utils.h"
3988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/forgetting_curve_utils.h"
406bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
416bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanaginamespace latinime {
426bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanaginamespace backward {
4307e14126318f7661f76fdce421d723d64e7ea8deKeisuke Kuroyanaginamespace v402 {
446bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
456bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagibool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
46e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi        const EntryCounts &entryCounts) const {
476bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
486bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    BufferWithExtendableBuffer headerBuffer(
496bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
506bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
516bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
526bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
53e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi            entryCounts, extendedRegionSize, &headerBuffer)) {
546bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        AKLOGE("Cannot write header structure to buffer. "
556bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
5678212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
5778212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
586bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
596bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
606bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
616bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi}
626bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
636bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagibool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
646bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const char *const dictDirPath) {
656bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
666bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
676bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
686bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                    Ver4DictConstants::MAX_DICTIONARY_SIZE));
696bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    int unigramCount = 0;
706bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    int bigramCount = 0;
716bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
726bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
736bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
746bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    BufferWithExtendableBuffer headerBuffer(
756bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
7678212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi    MutableEntryCounters entryCounters;
7778212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi    entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
7878212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi    entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
796bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
8078212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi            entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
816bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
826bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
836bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
846bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi}
856bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
866bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagibool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
876bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
886bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        int *const outUnigramCount, int *const outBigramCount) {
896bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
906bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getProbabilityDictContent(), headerPolicy);
916bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
926bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
936bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getTerminalPositionLookupTable(), headerPolicy);
946bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
956bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getTerminalPositionLookupTable());
966bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
976bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
986bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &shortcutPolicy);
996bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
1006bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
1016bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
1026bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    DynamicPtGcEventListeners
1036bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
1046bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                    traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
1056bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                            &ptNodeWriter);
1066bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
1076bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
1086bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1096bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1106bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
1116bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            .getValidUnigramCount();
11278212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi    const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
1136bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
1146bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
1156bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
1166bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                    maxUnigramCount);
1176bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            return false;
1186bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        }
1196bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1206bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
1216bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
1226bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
1236bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            traversePolicyToUpdateBigramProbability(&ptNodeWriter);
1246bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
1256bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &traversePolicyToUpdateBigramProbability)) {
1266bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1276bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1286bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
12978212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi    const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
1306bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
1316bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        if (!truncateBigrams(maxBigramCount)) {
1326bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
1336bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            return false;
1346bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        }
1356bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1366bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
1376bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Mapping from positions in mBuffer to positions in bufferToWrite.
1386bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
1396bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
1406bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
1416bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
1426bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &shortcutPolicy);
1436bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
1446bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
1456bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                    buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
1466bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
1476bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
1486bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1496bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1506bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
1516bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Create policy instances for the GCed dictionary.
1526bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
1536bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            buffersToWrite->getProbabilityDictContent(), headerPolicy);
1546bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
1556bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
1566bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
1576bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
1586bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            buffersToWrite->getTerminalPositionLookupTable());
1596bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
1606bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
1616bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &newShortcutPolicy);
1626bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Re-assign terminal IDs for valid terminal PtNodes.
1636bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
1646bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
1656bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &terminalIdMap)) {
1666bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1676bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1686bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Run GC for probability dict content.
1696bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
1706bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getProbabilityDictContent())) {
1716bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1726bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1736bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Run GC for bigram dict content.
1746bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
1756bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getBigramDictContent(), outBigramCount)) {
1766bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1776bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1786bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Run GC for shortcut dict content.
1796bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
1806bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getShortcutDictContent())) {
1816bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1826bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1836bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
1846bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
1856bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
1866bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
1876bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
1886bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &traversePolicyToUpdateAllPositionFields)) {
1896bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1906bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1916bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
1926bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
1936bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
1946bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
1956bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
1966bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
1976bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
1986bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
1996bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    return true;
2006bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi}
2016bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
2026bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagibool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
2036bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const Ver4PatriciaTrieNodeReader *const ptNodeReader,
2046bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
2056bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const TerminalPositionLookupTable *const terminalPosLookupTable =
2066bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getTerminalPositionLookupTable();
2076bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
2086bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
2096bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            priorityQueue;
2106bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    for (int i = 0; i < nextTerminalId; ++i) {
2116bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
2126bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        if (terminalPos == NOT_A_DICT_POS) {
2136bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            continue;
2146bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        }
2156bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const ProbabilityEntry probabilityEntry =
2166bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
2176bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const int probability = probabilityEntry.hasHistoricalInfo() ?
2186bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                ForgettingCurveUtils::decodeProbability(
2196bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                        probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
2206bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                probabilityEntry.getProbability();
2216bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        priorityQueue.push(DictProbability(terminalPos, probability,
222287e155e44b4e937f2a62d010805702bc813c43bKeisuke Kuroyanagi                probabilityEntry.getHistoricalInfo()->getTimestamp()));
2236bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
2246bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
2256bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Delete unigrams.
2266bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
2276bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const int ptNodePos = priorityQueue.top().getDictPos();
22807e14126318f7661f76fdce421d723d64e7ea8deKeisuke Kuroyanagi        priorityQueue.pop();
2296bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const PtNodeParams ptNodeParams =
2300fbca1ac2388db81a443c1705732130564c3f714Keisuke Kuroyanagi                ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
23107e14126318f7661f76fdce421d723d64e7ea8deKeisuke Kuroyanagi        if (ptNodeParams.representsNonWordInfo()) {
23207e14126318f7661f76fdce421d723d64e7ea8deKeisuke Kuroyanagi            continue;
23307e14126318f7661f76fdce421d723d64e7ea8deKeisuke Kuroyanagi        }
2346bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
2356bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
2366bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            return false;
2376bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        }
2386bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
2396bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    return true;
2406bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi}
2416bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
2426bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagibool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
2436bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const TerminalPositionLookupTable *const terminalPosLookupTable =
2446bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mBuffers->getTerminalPositionLookupTable();
2456bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
2466bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
2476bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            priorityQueue;
2486bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
2496bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    for (int i = 0; i < nextTerminalId; ++i) {
2506bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
2516bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        if (bigramListPos == NOT_A_DICT_POS) {
2526bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            continue;
2536bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        }
2546bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        bool hasNext = true;
2556bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        int readingPos = bigramListPos;
2566bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        while (hasNext) {
2576bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            const int entryPos = readingPos;
2586bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            const BigramEntry bigramEntry =
2596bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                    bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
2606bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            hasNext = bigramEntry.hasNext();
2616bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            if (!bigramEntry.isValid()) {
2626bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                continue;
2636bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            }
2646bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            const int probability = bigramEntry.hasHistoricalInfo() ?
2656bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                    ForgettingCurveUtils::decodeProbability(
2666bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                            bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
2676bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                    bigramEntry.getProbability();
2686bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            priorityQueue.push(DictProbability(entryPos, probability,
269287e155e44b4e937f2a62d010805702bc813c43bKeisuke Kuroyanagi                    bigramEntry.getHistoricalInfo()->getTimestamp()));
2706bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        }
2716bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
2726bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
2736bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    // Delete bigrams.
2746bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
2756bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const int entryPos = priorityQueue.top().getDictPos();
2766bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
2776bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
2786bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
2796bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
2806bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            return false;
2816bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        }
2826bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        priorityQueue.pop();
2836bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
2846bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    return true;
2856bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi}
2866bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
2876bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagibool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
2886bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
2896bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!ptNodeParams->isTerminal()) {
2906bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return true;
2916bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
2926bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
2936bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi            mTerminalIdMap->find(ptNodeParams->getTerminalId());
2946bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (it == mTerminalIdMap->end()) {
2956bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
2966bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi                ptNodeParams->getTerminalId(), mTerminalIdMap->size());
2976bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        return false;
2986bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
2996bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
3006bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi        AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
3016bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    }
3026bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi    return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
3036bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi}
3046bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi
30507e14126318f7661f76fdce421d723d64e7ea8deKeisuke Kuroyanagi} // namespace v402
3066bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi} // namespace backward
3076bf268132d60061fd26bd8cba63a12b56b22056eKeisuke Kuroyanagi} // namespace latinime
308