// language_model_dict_content.h revision 063f86d40f2cb0d250b2166af8e1cf98ab135f8c
1dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi/* 2dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * Copyright (C) 2014, The Android Open Source Project 3dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * 4dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License"); 5dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * you may not use this file except in compliance with the License. 6dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * You may obtain a copy of the License at 7dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * 8dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * http://www.apache.org/licenses/LICENSE-2.0 9dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * 10dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software 11dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS, 12dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * See the License for the specific language governing permissions and 14dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * limitations under the License. 
15dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi */ 16dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi 17dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H 18dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H 19dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi 20c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi#include <cstdio> 21063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi#include <vector> 22c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 23dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#include "defines.h" 2408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" 2508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" 2608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" 27c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi#include "suggest/policyimpl/dictionary/utils/trie_map.h" 28c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi#include "utils/byte_array_view.h" 2908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "utils/int_array_view.h" 30dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi 31dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanaginamespace latinime { 32dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi 339aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagiclass HeaderPolicy; 349aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi 3508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi/** 3608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi * Class representing language model. 
3708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi * 3808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi * This class provides methods to get and store unigram/n-gram probability information and flags. 3908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi */ 40dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagiclass LanguageModelDictContent { 41dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi public: 42c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi LanguageModelDictContent(const ReadWriteByteArrayView trieMapBuffer, 43c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi const bool hasHistoricalInfo) 4408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi : mTrieMap(trieMapBuffer), mHasHistoricalInfo(hasHistoricalInfo) {} 45c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 4608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi explicit LanguageModelDictContent(const bool hasHistoricalInfo) 4708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi : mTrieMap(), mHasHistoricalInfo(hasHistoricalInfo) {} 4808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 4908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi bool isNearSizeLimit() const { 5008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return mTrieMap.isNearSizeLimit(); 5108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 52c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 53c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi bool save(FILE *const file) const; 54dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi 5508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, 5608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const LanguageModelDictContent *const originalContent, 5708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi int *const outNgramCount); 
5808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 59851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi ProbabilityEntry getProbabilityEntry(const int wordId) const { 60851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi return getNgramProbabilityEntry(WordIdArrayView(), wordId); 61851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi } 62851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi 63851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { 64851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); 65851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi } 66851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi 67b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi bool removeProbabilityEntry(const int wordId) { 68b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi return removeNgramProbabilityEntry(WordIdArrayView(), wordId); 69b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi } 70b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi 71851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds, 72851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi const int wordId) const; 7308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 74851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId, 7508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const ProbabilityEntry *const probabilityEntry); 7608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 77b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId); 
78b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi 799aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi bool updateAllProbabilityEntries(const HeaderPolicy *const headerPolicy, 809aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi int *const outEntryCounts) { 81063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi for (int i = 0; i <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++i) { 82063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi outEntryCounts[i] = 0; 83063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi } 849aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi return updateAllProbabilityEntriesInner(mTrieMap.getRootBitmapEntryIndex(), 0 /* level */, 859aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi headerPolicy, outEntryCounts); 869aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi } 879aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi 88063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi // entryCounts should be created by updateAllProbabilityEntries. 
89063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi bool truncateEntries(const int *const entryCounts, const int *const maxEntryCounts, 90063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const HeaderPolicy *const headerPolicy); 91063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 92dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi private: 93dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); 94c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi 95063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi class EntryInfoToTurncate { 96063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi public: 97063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi class Comparator { 98063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi public: 99063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi bool operator()(const EntryInfoToTurncate &left, 100063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const EntryInfoToTurncate &right) const; 101063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi private: 102063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi DISALLOW_ASSIGNMENT_OPERATOR(Comparator); 103063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi }; 104063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 105063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi EntryInfoToTurncate(const int probability, const int timestamp, const int key, 106063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const int entryLevel, const int *const prevWordIds); 107063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 108063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi int mProbability; 109063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi int mTimestamp; 110063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi int mKey; 111063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi int 
mEntryLevel; 112063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; 113063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 114063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi private: 115063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate); 116063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi }; 117063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi 118c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi TrieMap mTrieMap; 11908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const bool mHasHistoricalInfo; 12008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 12108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, 12208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex, 12308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi int *const outNgramCount); 1249a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds); 12503dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; 1269aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi bool updateAllProbabilityEntriesInner(const int bitmapEntryIndex, const int level, 1279aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi const HeaderPolicy *const headerPolicy, int *const outEntryCounts); 128063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy, 129063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const int maxEntryCount, const int targetLevel); 130063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi bool getEntryInfo(const HeaderPolicy 
*const headerPolicy, const int targetLevel, 131063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi const int bitmapEntryIndex, std::vector<int> *const prevWordIds, 132063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi std::vector<EntryInfoToTurncate> *const outEntryInfo) const; 133dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi}; 134dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi} // namespace latinime 135dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */ 136