1c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi/*
2c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * Copyright (C) 2014, The Android Open Source Project
3c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi *
4c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License");
5c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * you may not use this file except in compliance with the License.
6c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * You may obtain a copy of the License at
7c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi *
8c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi *     http://www.apache.org/licenses/LICENSE-2.0
9c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi *
10c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software
11c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS,
12c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * See the License for the specific language governing permissions and
14c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * limitations under the License.
15c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi */
16c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
1788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/content/language_model_dict_content.h"
18c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
19063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi#include <algorithm>
20063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi#include <cstring>
21063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
2288bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
2388bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/probability_utils.h"
2478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi#include "utils/ngram_utils.h"
259aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi
26c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanaginamespace latinime {
27c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
286b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagiconst int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0;
296b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagiconst int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1;
30758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi
31c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagibool LanguageModelDictContent::save(FILE *const file) const {
326b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi    return mTrieMap.save(file) && mGlobalCounters.save(file);
33c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi}
34c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
3508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagibool LanguageModelDictContent::runGC(
3608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
3747fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi        const LanguageModelDictContent *const originalContent) {
3808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(),
3947fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi            0 /* nextLevelBitmapEntryIndex */);
4008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi}
4108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
427d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagiconst WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds,
43bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        const int wordId, const bool mustMatchAllPrevWords,
44bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        const HeaderPolicy *const headerPolicy) const {
45395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi    int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
46395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi    bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex();
4772d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi    int maxPrevWordCount = 0;
48395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi    for (size_t i = 0; i < prevWordIds.size(); ++i) {
49395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        const int nextBitmapEntryIndex =
50395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi                mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex;
51395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) {
52395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi            break;
53395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        }
5472d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi        maxPrevWordCount = i + 1;
55395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        bitmapEntryIndices[i + 1] = nextBitmapEntryIndex;
56395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi    }
57395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi
58bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi    const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
59bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi    if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) {
60bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        // The word should be treated as a invalid word.
61bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        return WordAttributes();
62bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi    }
6372d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi    for (int i = maxPrevWordCount; i >= 0; --i) {
64bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) {
65bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            break;
66bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        }
67395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]);
68395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        if (!result.mIsValid) {
69395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi            continue;
70395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        }
7136ba139ca6a84c10a4635624c9c2861801f1e822Keisuke Kuroyanagi        const ProbabilityEntry probabilityEntry =
7236ba139ca6a84c10a4635624c9c2861801f1e822Keisuke Kuroyanagi                ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
737d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        int probability = NOT_A_PROBABILITY;
74395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        if (mHasHistoricalInfo) {
75bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
76bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            int contextCount = 0;
7772d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi            if (i == 0) {
7872d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                // unigram
79bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                contextCount = mGlobalCounters.getTotalCount();
8072d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi            } else {
8172d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry(
8272d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                        prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]);
8372d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                if (!prevWordProbabilityEntry.isValid()) {
8472d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                    continue;
8572d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                }
86bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                if (prevWordProbabilityEntry.representsBeginningOfSentence()
87bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                        && historicalInfo->getCount() == 1) {
88bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    // BoS ngram requires multiple contextCount.
89bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    continue;
9072d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                }
91bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount();
9272d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi            }
9378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi            const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1);
94bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            const float rawProbability =
95bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts(
9678212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                            historicalInfo->getCount(), contextCount, ngramType);
97bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            const int encodedRawProbability =
98bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    ProbabilityUtils::encodeRawProbability(rawProbability);
99bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            const int decayedProbability =
100bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    DynamicLanguageModelProbabilityUtils::getDecayedProbability(
101bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                            encodedRawProbability, *historicalInfo);
102bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            probability = DynamicLanguageModelProbabilityUtils::backoff(
10378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                    decayedProbability, ngramType);
104395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        } else {
1057d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi            probability = probabilityEntry.getProbability();
106395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi        }
1077d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
1087d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        // probabilityEntry.
109b5ef884fbb6bfd08ce793604cdf7f0941e958a84Keisuke Kuroyanagi        return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
110b5ef884fbb6bfd08ce793604cdf7f0941e958a84Keisuke Kuroyanagi                unigramProbabilityEntry.isNotAWord(),
1117d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi                unigramProbabilityEntry.isPossiblyOffensive());
112395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi    }
113395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi    // Cannot find the word.
1147d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    return WordAttributes();
115395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi}
116395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi
117851e0458fe460526b1f953e39a1e406a21ab4647Keisuke KuroyanagiProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry(
11808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        const WordIdArrayView prevWordIds, const int wordId) const {
11903dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
12003dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
12108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        return ProbabilityEntry();
12208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
12303dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex);
12408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    if (!result.mIsValid) {
12508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        // Not found.
12608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        return ProbabilityEntry();
12708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
12808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
12908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi}
13008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
131851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagibool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds,
132d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi        const int wordId, const ProbabilityEntry *const probabilityEntry) {
133d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi    if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
134d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi        return false;
135d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi    }
1369a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi    const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds);
13703dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
13808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        return false;
13908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
140d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi    return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex);
141b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi}
142b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi
143b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagibool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds,
144b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi        const int wordId) {
145b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi    const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
146b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi    if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
147b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi        // Cannot find bitmap entry for the probability entry. The entry doesn't exist.
148b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi        return false;
149b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi    }
150b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi    return mTrieMap.remove(wordId, bitmapEntryIndex);
15108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi}
15208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
15307b3b41c25e000615396399e484a041df9301449Keisuke KuroyanagiLanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries(
15407b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi        const WordIdArrayView prevWordIds) const {
15507b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi    const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
15607b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi    return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo);
15707b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi}
15807b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi
159c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagistd::vector<LanguageModelDictContent::DumppedFullEntryInfo>
160c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        LanguageModelDictContent::exportAllNgramEntriesRelatedToWord(
161c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                const HeaderPolicy *const headerPolicy, const int wordId) const {
162c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    const TrieMap::Result result = mTrieMap.getRoot(wordId);
163c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    if (!result.mIsValid || result.mNextLevelBitmapEntryIndex == TrieMap::INVALID_INDEX) {
164c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        // The word doesn't have any related ngram entries.
165c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        return std::vector<DumppedFullEntryInfo>();
166c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    }
167c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    std::vector<int> prevWordIds = { wordId };
168c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    std::vector<DumppedFullEntryInfo> entries;
169c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    exportAllNgramEntriesRelatedToWordInner(headerPolicy, result.mNextLevelBitmapEntryIndex,
170c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            &prevWordIds, &entries);
171c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    return entries;
172c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi}
173c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi
174c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagivoid LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner(
175c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex,
176c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        std::vector<int> *const prevWordIds,
177c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const {
178c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
179c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        const int wordId = entry.key();
180c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        const ProbabilityEntry probabilityEntry =
181c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
182c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        if (probabilityEntry.isValid()) {
183c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            const WordAttributes wordAttributes = getWordAttributes(
184bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */,
185bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    headerPolicy);
186c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId,
187c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                    wordAttributes, probabilityEntry);
188c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        }
189c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        if (entry.hasNextLevelMap()) {
190c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            prevWordIds->push_back(wordId);
191c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            exportAllNgramEntriesRelatedToWordInner(headerPolicy,
192c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                    entry.getNextLevelBitmapEntryIndex(), prevWordIds, outBummpedFullEntryInfo);
193c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            prevWordIds->pop_back();
194c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        }
195c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    }
196c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi}
197c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi
19847fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagibool LanguageModelDictContent::truncateEntries(const EntryCounts &currentEntryCounts,
19947fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi        const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy,
20047fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi        MutableEntryCounters *const outEntryCounters) {
20147fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi    for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) {
20247fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi        const int totalWordCount = prevWordCount + 1;
20378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi        const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount);
20478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi        if (currentEntryCounts.getNgramCount(ngramType)
20578212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                <= maxEntryCounts.getNgramCount(ngramType)) {
20678212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi            outEntryCounters->setNgramCount(ngramType,
20778212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                    currentEntryCounts.getNgramCount(ngramType));
208063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            continue;
209063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        }
21047fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi        int entryCount = 0;
21147fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi        if (!turncateEntriesInSpecifiedLevel(headerPolicy,
21278212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                maxEntryCounts.getNgramCount(ngramType), prevWordCount, &entryCount)) {
213063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            return false;
214063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        }
21578212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi        outEntryCounters->setNgramCount(ngramType, entryCount);
216063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
217063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    return true;
218063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi}
219063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
2205400701908262c929a77141cc84567646053d032Keisuke Kuroyanagibool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds,
2215400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        const int wordId, const bool isValid, const HistoricalInfo historicalInfo,
222e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi        const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) {
2235400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    if (!mHasHistoricalInfo) {
2245400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info.");
2255400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        return false;
2265400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    }
2275400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId);
2285400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom(
2295400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi            originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy);
2305400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) {
2315400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        return false;
2325400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    }
2336b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi    mGlobalCounters.incrementTotalCount();
2346b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi    mGlobalCounters.updateMaxValueOfCounters(
2356b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi            updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount());
2365400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    for (size_t i = 0; i < prevWordIds.size(); ++i) {
2375400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        if (prevWordIds[i] == NOT_A_WORD_ID) {
2385400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi            break;
2395400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        }
2405400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        // TODO: Optimize this code.
2415400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1);
2425400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry(
2435400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi                limitedPrevWordIds, wordId);
2445400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom(
2455400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi                originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy);
2465400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) {
2475400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi            return false;
2485400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        }
2496b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi        mGlobalCounters.updateMaxValueOfCounters(
250bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                updatedNgramProbabilityEntry.getHistoricalInfo()->getCount());
251e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi        if (!originalNgramProbabilityEntry.isValid()) {
25278212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi            // (i + 2) words are used in total because the prevWords consists of (i + 1) words when
25378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi            // looking at its i-th element.
25478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi            entryCountersToUpdate->incrementNgramCount(
25578212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                    NgramUtils::getNgramTypeFromWordCount(i + 2));
2565400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        }
2575400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    }
2585400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    return true;
2595400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi}
2605400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi
2615400701908262c929a77141cc84567646053d032Keisuke Kuroyanagiconst ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom(
2625400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        const ProbabilityEntry &originalProbabilityEntry, const bool isValid,
2635400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const {
264bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi    const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(),
265bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount()
266bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    + historicalInfo.getCount());
2675400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    if (originalProbabilityEntry.isValid()) {
2685400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo);
2695400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    } else {
2705400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi        return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo);
2715400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi    }
2725400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi}
2735400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi
27408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagibool LanguageModelDictContent::runGCInner(
27508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
27647fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi        const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) {
27708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    for (auto &entry : trieMapRange) {
27808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        const auto it = terminalIdMap->find(entry.key());
27908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) {
28008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            // The word has been removed.
28108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            continue;
28208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        }
28308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) {
28408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            return false;
28508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        }
28608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        if (entry.hasNextLevelMap()) {
28708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(),
28847fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi                    mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) {
28908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                return false;
29008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            }
29108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        }
29208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
29308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    return true;
29408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi}
29508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
2969a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagiint LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) {
297c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    int lastBitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex();
298c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    for (const int wordId : prevWordIds) {
299c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        const TrieMap::Result result = mTrieMap.get(wordId, lastBitmapEntryIndex);
300c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        if (result.mIsValid && result.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX) {
301c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            lastBitmapEntryIndex = result.mNextLevelBitmapEntryIndex;
302c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            continue;
303c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        }
304c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        if (!result.mIsValid) {
305c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            if (!mTrieMap.put(wordId, ProbabilityEntry().encode(mHasHistoricalInfo),
306c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                    lastBitmapEntryIndex)) {
307c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId,
308c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                        lastBitmapEntryIndex);
309c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                return TrieMap::INVALID_INDEX;
310c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi            }
3114926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi        }
312c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi        lastBitmapEntryIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId,
313c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi                lastBitmapEntryIndex);
3144926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi    }
315c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi    return lastBitmapEntryIndex;
3169a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi}
3179a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi
31803dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagiint LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const {
31903dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex();
32003dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    for (const int wordId : prevWordIds) {
32103dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi        const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex);
32203dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi        if (!result.mIsValid) {
32303dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi            return TrieMap::INVALID_INDEX;
32403dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi        }
32503dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi        bitmapEntryIndex = result.mNextLevelBitmapEntryIndex;
32603dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    }
32703dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    return bitmapEntryIndex;
32803dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi}
32903dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi
3305400701908262c929a77141cc84567646053d032Keisuke Kuroyanagibool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex,
3313601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi        const int prevWordCount, const HeaderPolicy *const headerPolicy,
332bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) {
3339aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi    for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
3343601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi        if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
3353601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi            AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.",
3363601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi                    prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM);
3379aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi            return false;
3389aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi        }
3399aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi        const ProbabilityEntry probabilityEntry =
3409aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi                ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
3413601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi        if (prevWordCount > 0 && probabilityEntry.isValid()
3423601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi                && !mTrieMap.getRoot(entry.key()).mIsValid) {
3433601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi            // The entry is related to a word that has been removed. Remove the entry.
3443601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi            if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
3453601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi                return false;
3463601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi            }
3473601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi            continue;
3483601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi        }
349bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        if (mHasHistoricalInfo && probabilityEntry.isValid()) {
350bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo();
351bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC(
352bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    *originalHistoricalInfo)) {
3539aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi                // Remove the entry.
3549aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi                if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
3559aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi                    return false;
3569aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi                }
3579aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi                continue;
3589aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi            }
359bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            if (needsToHalveCounters) {
360bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                const int updatedCount = originalHistoricalInfo->getCount() / 2;
361bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                if (updatedCount == 0) {
362bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    // Remove the entry.
363bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
364bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                        return false;
365bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    }
366bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    continue;
367bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                }
368bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(),
369bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                        originalHistoricalInfo->getLevel(), updatedCount);
370bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(),
371bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                        &historicalInfoToSave);
372bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo),
373bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                        bitmapEntryIndex)) {
374bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                    return false;
375bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                }
376bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi            }
3779aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi        }
37878212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi        outEntryCounters->incrementNgramCount(
37978212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1));
3809aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi        if (!entry.hasNextLevelMap()) {
3819aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi            continue;
3829aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi        }
3833601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi        if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(),
384bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) {
3859aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi            return false;
3869aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi        }
3879aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi    }
3889aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi    return true;
3899aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi}
3909aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi
391063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagibool LanguageModelDictContent::turncateEntriesInSpecifiedLevel(
392758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi        const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel,
393758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi        int *const outEntryCount) {
394063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    std::vector<int> prevWordIds;
395063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    std::vector<EntryInfoToTurncate> entryInfoVector;
396063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(),
397063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            &prevWordIds, &entryInfoVector)) {
398063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        return false;
399063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
400063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    if (static_cast<int>(entryInfoVector.size()) <= maxEntryCount) {
401758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi        *outEntryCount = static_cast<int>(entryInfoVector.size());
402063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        return true;
403063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
404758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi    *outEntryCount = maxEntryCount;
405063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    const int entryCountToRemove = static_cast<int>(entryInfoVector.size()) - maxEntryCount;
406063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove,
407063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            entryInfoVector.end(),
408063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            EntryInfoToTurncate::Comparator());
409063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    for (int i = 0; i < entryCountToRemove; ++i) {
410063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        const EntryInfoToTurncate &entryInfo = entryInfoVector[i];
411063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        if (!removeNgramProbabilityEntry(
41278212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount),
41378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi                entryInfo.mKey)) {
414063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            return false;
415063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        }
416063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
417063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    return true;
418063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi}
419063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
420063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagibool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy,
421063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        const int targetLevel, const int bitmapEntryIndex,  std::vector<int> *const prevWordIds,
422063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        std::vector<EntryInfoToTurncate> *const outEntryInfo) const {
4233601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi    const int prevWordCount = prevWordIds->size();
424063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
4253601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi        if (prevWordCount < targetLevel) {
426063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            if (!entry.hasNextLevelMap()) {
427063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi                continue;
428063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            }
429063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            prevWordIds->push_back(entry.key());
430063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(),
431063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi                    prevWordIds, outEntryInfo)) {
432063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi                return false;
433063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            }
434063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            prevWordIds->pop_back();
435063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            continue;
436063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        }
437063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        const ProbabilityEntry probabilityEntry =
438063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi                ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
439bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        const int priority = mHasHistoricalInfo
440bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction(
441bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                        *probabilityEntry.getHistoricalInfo())
442bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi                : probabilityEntry.getProbability();
443bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(),
444063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi                entry.key(), targetLevel, prevWordIds->data());
445063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
446063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    return true;
447063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi}
448063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
449063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagibool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()(
450063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const {
451bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi    if (left.mPriority != right.mPriority) {
452bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        return left.mPriority < right.mPriority;
453063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
454bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi    if (left.mCount != right.mCount) {
455bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        return left.mCount < right.mCount;
456063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
457063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    if (left.mKey != right.mKey) {
458063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        return left.mKey < right.mKey;
459063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
4603601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi    if (left.mPrevWordCount != right.mPrevWordCount) {
4613601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi        return left.mPrevWordCount > right.mPrevWordCount;
462063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
4633601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi    for (int i = 0; i < left.mPrevWordCount; ++i) {
464063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) {
465063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            return left.mPrevWordIds[i] < right.mPrevWordIds[i];
466063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        }
467063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    }
468063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    // left and rigth represent the same entry.
469063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    return false;
470063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi}
471063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
472bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke KuroyanagiLanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority,
473bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        const int count, const int key, const int prevWordCount, const int *const prevWordIds)
474bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi        : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) {
4753601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi    memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0]));
476063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi}
477063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
478c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi} // namespace latinime
479