1c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi/* 2c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * Copyright (C) 2014, The Android Open Source Project 3c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * 4c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License"); 5c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * you may not use this file except in compliance with the License. 6c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * You may obtain a copy of the License at 7c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * 8c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * http://www.apache.org/licenses/LICENSE-2.0 9c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * 10c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software 11c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS, 12c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * See the License for the specific language governing permissions and 14c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi * limitations under the License. 15c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi */ 16c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 1788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/content/language_model_dict_content.h" 18c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 19063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi#include <algorithm> 20063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi#include <cstring> 21063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 2288bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" 2388bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/probability_utils.h" 2478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi#include "utils/ngram_utils.h" 259aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi 26c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanaginamespace latinime { 27c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 286b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagiconst int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0; 296b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagiconst int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1; 30758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi 31c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagibool LanguageModelDictContent::save(FILE *const file) const { 326b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi return mTrieMap.save(file) && mGlobalCounters.save(file); 33c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi} 34c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 3508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagibool LanguageModelDictContent::runGC( 3608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, 3747fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi const LanguageModelDictContent *const originalContent) { 3808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(), 3947fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi 0 /* nextLevelBitmapEntryIndex */); 4008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi} 4108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 427d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagiconst WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, 43bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const int wordId, const bool mustMatchAllPrevWords, 44bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const HeaderPolicy *const headerPolicy) const { 45395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; 46395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); 4772d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi int maxPrevWordCount = 0; 48395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi for (size_t i = 0; i < prevWordIds.size(); ++i) { 49395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi const int nextBitmapEntryIndex = 50395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex; 51395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) { 52395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi break; 53395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi } 5472d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi maxPrevWordCount = i + 1; 55395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; 56395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi } 57395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi 58bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); 59bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) { 60bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi // The word should be treated as a invalid word. 61bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi return WordAttributes(); 62bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi } 6372d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi for (int i = maxPrevWordCount; i >= 0; --i) { 64bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) { 65bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi break; 66bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi } 67395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); 68395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi if (!result.mIsValid) { 69395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi continue; 70395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi } 7136ba139ca6a84c10a4635624c9c2861801f1e822Keisuke Kuroyanagi const ProbabilityEntry probabilityEntry = 7236ba139ca6a84c10a4635624c9c2861801f1e822Keisuke Kuroyanagi ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); 737d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi int probability = NOT_A_PROBABILITY; 74395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi if (mHasHistoricalInfo) { 75bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); 76bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi int contextCount = 0; 7772d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi if (i == 0) { 7872d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi // unigram 79bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi contextCount = mGlobalCounters.getTotalCount(); 8072d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi } else { 8172d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( 8272d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); 8372d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi if (!prevWordProbabilityEntry.isValid()) { 8472d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi continue; 8572d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi } 86bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (prevWordProbabilityEntry.representsBeginningOfSentence() 87bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi && historicalInfo->getCount() == 1) { 88bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi // BoS ngram requires multiple contextCount. 89bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi continue; 9072d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi } 91bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount(); 9272d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi } 9378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1); 94bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const float rawProbability = 95bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts( 9678212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi historicalInfo->getCount(), contextCount, ngramType); 97bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const int encodedRawProbability = 98bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi ProbabilityUtils::encodeRawProbability(rawProbability); 99bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const int decayedProbability = 100bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi DynamicLanguageModelProbabilityUtils::getDecayedProbability( 101bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi encodedRawProbability, *historicalInfo); 102bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi probability = DynamicLanguageModelProbabilityUtils::backoff( 10378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi decayedProbability, ngramType); 104395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi } else { 1057d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi probability = probabilityEntry.getProbability(); 106395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi } 1077d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in 1087d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi // probabilityEntry. 109b5ef884fbb6bfd08ce793604cdf7f0941e958a84Keisuke Kuroyanagi return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), 110b5ef884fbb6bfd08ce793604cdf7f0941e958a84Keisuke Kuroyanagi unigramProbabilityEntry.isNotAWord(), 1117d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi unigramProbabilityEntry.isPossiblyOffensive()); 112395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi } 113395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi // Cannot find the word. 1147d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi return WordAttributes(); 115395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi} 116395fe8e98dc102fcad52ef34d281e83e3cd13f46Keisuke Kuroyanagi 117851e0458fe460526b1f953e39a1e406a21ab4647Keisuke KuroyanagiProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( 11808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const WordIdArrayView prevWordIds, const int wordId) const { 11903dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); 12003dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { 12108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return ProbabilityEntry(); 12208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 12303dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); 12408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi if (!result.mIsValid) { 12508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi // Not found. 12608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return ProbabilityEntry(); 12708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 12808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); 12908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi} 13008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 131851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagibool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, 132d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi const int wordId, const ProbabilityEntry *const probabilityEntry) { 133d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) { 134d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi return false; 135d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi } 1369a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds); 13703dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { 13808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return false; 13908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 140d3097c67ca5da0f01d371f27d1b2dcdf32b80e3eKeisuke Kuroyanagi return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); 141b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi} 142b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi 143b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagibool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, 144b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi const int wordId) { 145b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); 146b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { 147b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi // Cannot find bitmap entry for the probability entry. The entry doesn't exist. 148b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi return false; 149b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi } 150b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi return mTrieMap.remove(wordId, bitmapEntryIndex); 15108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi} 15208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 15307b3b41c25e000615396399e484a041df9301449Keisuke KuroyanagiLanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries( 15407b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi const WordIdArrayView prevWordIds) const { 15507b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); 15607b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo); 15707b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi} 15807b3b41c25e000615396399e484a041df9301449Keisuke Kuroyanagi 159c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagistd::vector<LanguageModelDictContent::DumppedFullEntryInfo> 160c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi LanguageModelDictContent::exportAllNgramEntriesRelatedToWord( 161c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi const HeaderPolicy *const headerPolicy, const int wordId) const { 162c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi const TrieMap::Result result = mTrieMap.getRoot(wordId); 163c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi if (!result.mIsValid || result.mNextLevelBitmapEntryIndex == TrieMap::INVALID_INDEX) { 164c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi // The word doesn't have any related ngram entries. 165c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi return std::vector<DumppedFullEntryInfo>(); 166c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi } 167c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi std::vector<int> prevWordIds = { wordId }; 168c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi std::vector<DumppedFullEntryInfo> entries; 169c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi exportAllNgramEntriesRelatedToWordInner(headerPolicy, result.mNextLevelBitmapEntryIndex, 170c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi &prevWordIds, &entries); 171c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi return entries; 172c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi} 173c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi 174c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagivoid LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner( 175c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex, 176c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi std::vector<int> *const prevWordIds, 177c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const { 178c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { 179c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi const int wordId = entry.key(); 180c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi const ProbabilityEntry probabilityEntry = 181c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); 182c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi if (probabilityEntry.isValid()) { 183c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi const WordAttributes wordAttributes = getWordAttributes( 184bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */, 185bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi headerPolicy); 186c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId, 187c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi wordAttributes, probabilityEntry); 188c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi } 189c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi if (entry.hasNextLevelMap()) { 190c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi prevWordIds->push_back(wordId); 191c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi exportAllNgramEntriesRelatedToWordInner(headerPolicy, 192c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi entry.getNextLevelBitmapEntryIndex(), prevWordIds, outBummpedFullEntryInfo); 193c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi prevWordIds->pop_back(); 194c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi } 195c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi } 196c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi} 197c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi 19847fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagibool LanguageModelDictContent::truncateEntries(const EntryCounts ¤tEntryCounts, 19947fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy, 20047fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi MutableEntryCounters *const outEntryCounters) { 20147fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) { 20247fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi const int totalWordCount = prevWordCount + 1; 20378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount); 20478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi if (currentEntryCounts.getNgramCount(ngramType) 20578212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi <= maxEntryCounts.getNgramCount(ngramType)) { 20678212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi outEntryCounters->setNgramCount(ngramType, 20778212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi currentEntryCounts.getNgramCount(ngramType)); 208063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi continue; 209063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 21047fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi int entryCount = 0; 21147fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi if (!turncateEntriesInSpecifiedLevel(headerPolicy, 21278212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi maxEntryCounts.getNgramCount(ngramType), prevWordCount, &entryCount)) { 213063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return false; 214063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 21578212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi outEntryCounters->setNgramCount(ngramType, entryCount); 216063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 217063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return true; 218063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi} 219063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 2205400701908262c929a77141cc84567646053d032Keisuke Kuroyanagibool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, 2215400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const int wordId, const bool isValid, const HistoricalInfo historicalInfo, 222e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) { 2235400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi if (!mHasHistoricalInfo) { 2245400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info."); 2255400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi return false; 2265400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } 2275400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId); 2285400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom( 2295400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy); 2305400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) { 2315400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi return false; 2325400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } 2336b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi mGlobalCounters.incrementTotalCount(); 2346b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi mGlobalCounters.updateMaxValueOfCounters( 2356b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount()); 2365400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi for (size_t i = 0; i < prevWordIds.size(); ++i) { 2375400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi if (prevWordIds[i] == NOT_A_WORD_ID) { 2385400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi break; 2395400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } 2405400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi // TODO: Optimize this code. 2415400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1); 2425400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry( 2435400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi limitedPrevWordIds, wordId); 2445400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom( 2455400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy); 2465400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) { 2475400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi return false; 2485400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } 2496b0561f9d26215209e8e8895f5c35982af5158f0Keisuke Kuroyanagi mGlobalCounters.updateMaxValueOfCounters( 250bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi updatedNgramProbabilityEntry.getHistoricalInfo()->getCount()); 251e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi if (!originalNgramProbabilityEntry.isValid()) { 25278212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi // (i + 2) words are used in total because the prevWords consists of (i + 1) words when 25378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi // looking at its i-th element. 25478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi entryCountersToUpdate->incrementNgramCount( 25578212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi NgramUtils::getNgramTypeFromWordCount(i + 2)); 2565400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } 2575400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } 2585400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi return true; 2595400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi} 2605400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi 2615400701908262c929a77141cc84567646053d032Keisuke Kuroyanagiconst ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( 2625400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const ProbabilityEntry &originalProbabilityEntry, const bool isValid, 2635400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { 264bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(), 265bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount() 266bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi + historicalInfo.getCount()); 2675400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi if (originalProbabilityEntry.isValid()) { 2685400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); 2695400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } else { 2705400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo); 2715400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi } 2725400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi} 2735400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi 27408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagibool LanguageModelDictContent::runGCInner( 27508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, 27647fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) { 27708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi for (auto &entry : trieMapRange) { 27808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const auto it = terminalIdMap->find(entry.key()); 27908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) { 28008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi // The word has been removed. 28108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi continue; 28208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 28308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) { 28408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return false; 28508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 28608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi if (entry.hasNextLevelMap()) { 28708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(), 28847fc656cd79a59dab0b9c38cd15e3a66d25c267fKeisuke Kuroyanagi mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) { 28908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return false; 29008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 29108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 29208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 29308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return true; 29408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi} 29508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 2969a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagiint LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) { 297c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi int lastBitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); 298c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi for (const int wordId : prevWordIds) { 299c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi const TrieMap::Result result = mTrieMap.get(wordId, lastBitmapEntryIndex); 300c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi if (result.mIsValid && result.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX) { 301c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi lastBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; 302c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi continue; 303c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi } 304c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi if (!result.mIsValid) { 305c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi if (!mTrieMap.put(wordId, ProbabilityEntry().encode(mHasHistoricalInfo), 306c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi lastBitmapEntryIndex)) { 307c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId, 308c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi lastBitmapEntryIndex); 309c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi return TrieMap::INVALID_INDEX; 310c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi } 3114926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi } 312c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi lastBitmapEntryIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId, 313c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi lastBitmapEntryIndex); 3144926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi } 315c9865785f41e3dcbe9308f653afc69603c1e44c0Keisuke Kuroyanagi return lastBitmapEntryIndex; 3169a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi} 3179a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi 31803dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagiint LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const { 31903dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); 32003dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi for (const int wordId : prevWordIds) { 32103dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); 32203dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi if (!result.mIsValid) { 32303dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi return TrieMap::INVALID_INDEX; 32403dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi } 32503dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi bitmapEntryIndex = result.mNextLevelBitmapEntryIndex; 32603dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi } 32703dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi return bitmapEntryIndex; 32803dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi} 32903dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi 3305400701908262c929a77141cc84567646053d032Keisuke Kuroyanagibool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, 3313601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi const int prevWordCount, const HeaderPolicy *const headerPolicy, 332bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { 3339aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { 3343601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 3353601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", 3363601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM); 3379aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi return false; 3389aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 3399aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi const ProbabilityEntry probabilityEntry = 3409aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); 3413601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi if (prevWordCount > 0 && probabilityEntry.isValid() 3423601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi && !mTrieMap.getRoot(entry.key()).mIsValid) { 3433601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi // The entry is related to a word that has been removed. Remove the entry. 3443601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { 3453601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi return false; 3463601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi } 3473601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi continue; 3483601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi } 349bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (mHasHistoricalInfo && probabilityEntry.isValid()) { 350bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); 351bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( 352bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi *originalHistoricalInfo)) { 3539aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi // Remove the entry. 3549aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { 3559aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi return false; 3569aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 3579aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi continue; 3589aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 359bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (needsToHalveCounters) { 360bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const int updatedCount = originalHistoricalInfo->getCount() / 2; 361bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (updatedCount == 0) { 362bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi // Remove the entry. 363bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { 364bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi return false; 365bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi } 366bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi continue; 367bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi } 368bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), 369bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi originalHistoricalInfo->getLevel(), updatedCount); 370bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), 371bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi &historicalInfoToSave); 372bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), 373bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi bitmapEntryIndex)) { 374bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi return false; 375bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi } 376bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi } 3779aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 37878212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi outEntryCounters->incrementNgramCount( 37978212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1)); 3809aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi if (!entry.hasNextLevelMap()) { 3819aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi continue; 3829aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 3833601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), 384bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { 3859aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi return false; 3869aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 3879aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 3889aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi return true; 3899aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi} 3909aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi 391063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagibool LanguageModelDictContent::turncateEntriesInSpecifiedLevel( 392758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, 393758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi int *const outEntryCount) { 394063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi std::vector<int> prevWordIds; 395063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi std::vector<EntryInfoToTurncate> entryInfoVector; 396063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(), 397063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi &prevWordIds, &entryInfoVector)) { 398063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return false; 399063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 400063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi if (static_cast<int>(entryInfoVector.size()) <= maxEntryCount) { 401758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi *outEntryCount = static_cast<int>(entryInfoVector.size()); 402063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return true; 403063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 404758d09364457b9d3d0c514a7fcfc8a6e317c9222Keisuke Kuroyanagi *outEntryCount = maxEntryCount; 405063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const int entryCountToRemove = static_cast<int>(entryInfoVector.size()) - maxEntryCount; 406063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove, 407063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi entryInfoVector.end(), 408063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi EntryInfoToTurncate::Comparator()); 409063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi for (int i = 0; i < entryCountToRemove; ++i) { 410063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const EntryInfoToTurncate &entryInfo = entryInfoVector[i]; 411063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi if (!removeNgramProbabilityEntry( 41278212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount), 41378212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi entryInfo.mKey)) { 414063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return false; 415063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 416063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 417063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return true; 418063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi} 419063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 420063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagibool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy, 421063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const int targetLevel, const int bitmapEntryIndex, std::vector<int> *const prevWordIds, 422063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi std::vector<EntryInfoToTurncate> *const outEntryInfo) const { 4233601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi const int prevWordCount = prevWordIds->size(); 424063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { 4253601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi if (prevWordCount < targetLevel) { 426063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi if (!entry.hasNextLevelMap()) { 427063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi continue; 428063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 429063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi prevWordIds->push_back(entry.key()); 430063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(), 431063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi prevWordIds, outEntryInfo)) { 432063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return false; 433063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 434063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi prevWordIds->pop_back(); 435063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi continue; 436063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 437063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const ProbabilityEntry probabilityEntry = 438063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); 439bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const int priority = mHasHistoricalInfo 440bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction( 441bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi *probabilityEntry.getHistoricalInfo()) 442bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi : probabilityEntry.getProbability(); 443bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(), 444063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi entry.key(), targetLevel, prevWordIds->data()); 445063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 446063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return true; 447063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi} 448063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 449063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagibool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( 450063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { 451bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (left.mPriority != right.mPriority) { 452bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi return left.mPriority < right.mPriority; 453063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 454bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi if (left.mCount != right.mCount) { 455bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi return left.mCount < right.mCount; 456063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 457063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi if (left.mKey != right.mKey) { 458063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return left.mKey < right.mKey; 459063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 4603601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi if (left.mPrevWordCount != right.mPrevWordCount) { 4613601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi return left.mPrevWordCount > right.mPrevWordCount; 462063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 4633601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi for (int i = 0; i < left.mPrevWordCount; ++i) { 464063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) { 465063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return left.mPrevWordIds[i] < right.mPrevWordIds[i]; 466063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 467063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 468063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi // left and rigth represent the same entry. 469063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi return false; 470063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi} 471063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 472bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke KuroyanagiLanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority, 473bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi const int count, const int key, const int prevWordCount, const int *const prevWordIds) 474bcb52d73e206cee86a2ea126a5c3f948103057c6Keisuke Kuroyanagi : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) { 4753601c214f80cf62eecacd84b2fb27fe9c6b14a19Keisuke Kuroyanagi memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); 476063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi} 477063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 478c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi} // namespace latinime 479