ver4_patricia_trie_policy.h revision 537f6eea8a8d56fe532913a37f4dbff4b3d490af
12fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa/* 22fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Copyright (C) 2013, The Android Open Source Project 32fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 42fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Licensed under the Apache License, Version 2.0 (the "License"); 52fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * you may not use this file except in compliance with the License. 62fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * You may obtain a copy of the License at 72fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 82fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * http://www.apache.org/licenses/LICENSE-2.0 92fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Unless required by applicable law or agreed to in writing, software 112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * distributed under the License is distributed on an "AS IS" BASIS, 122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * See the License for the specific language governing permissions and 142fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * limitations under the License. 152fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */ 162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H 182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H 192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 20941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi#include <vector> 21941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi 222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h" 232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" 242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/header/header_policy.h" 252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" 266e4b674f83e0c287e00bfe6546db2a1f93daf5f0Keisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" 272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" 282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" 302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" 311d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" 322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" 336ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h" 342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime { 362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNode; 382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNodeVector; 392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 407d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Artificial id that is stored in the PtNode looked up by the word. 412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { 422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public: 434ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) 444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), 454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mDictBuffer(mBuffers->getWritableTrieBuffer()), 464ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), 474ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mBuffers->getTerminalPositionLookupTable()), 48851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi mNodeReader(mDictBuffer, mBuffers->getLanguageModelDictContent(), mHeaderPolicy), 491d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi mPtNodeArrayReader(mDictBuffer), 5057816c7a8bac1a47913da7a503ece2b5dd7cc0fcKeisuke Kuroyanagi mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, 51fe395232d69df0887863c1cbabe63def2586d29eKeisuke Kuroyanagi &mPtNodeArrayReader, &mShortcutPolicy), 522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), 532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mWritingHelper(mBuffers.get()), 542fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mUnigramCount(mHeaderPolicy->getUnigramCount()), 55941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi mBigramCount(mHeaderPolicy->getBigramCount()), 56b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; 572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa AK_FORCE_INLINE int getRootPosition() const { 592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return 0; 602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void createAndGetAllChildDicNodes(const DicNode *const dicNode, 632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DicNodeVector *const childDicNodes) const; 642fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 652fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int getCodePointsAndProbabilityAndReturnCodePointCount( 6694e4cd25a8f7417d30a0832f7476d39ece1df788Keisuke Kuroyanagi const int wordId, const int maxCodePointCount, int *const outCodePoints, 672fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int *const outUnigramProbability) const; 682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 6989a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; 702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 71537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, 72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const int wordId, MultiBigramMap *const multiBigramMap) const; 739f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi 74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi // TODO: Remove 75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi int getProbability(const int unigramProbability, const int bigramProbability) const { 76537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi // Not used. 77537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi return NOT_A_PROBABILITY; 78537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi } 792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 80537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; 812fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 82537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi void iterateNgramEntries(const WordIdArrayView prevWordIds, 83537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi NgramListener *const listener) const; 842d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi 85ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; 862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { 882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mHeaderPolicy; 892fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 902fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 916ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool addUnigramEntry(const CodePointArrayView wordCodePoints, 92793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi const UnigramProperty *const unigramProperty); 932fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 946ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool removeUnigramEntry(const CodePointArrayView wordCodePoints); 95f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi 969f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, 97620a05ae59ec9f7be39557094fc306c51c712ca1Keisuke Kuroyanagi const BigramProperty *const bigramProperty); 982fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 996ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, 1006ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const CodePointArrayView wordCodePoints); 1012fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 102dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flush(const char *const filePath); 1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 104dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flushWithGC(const char *const filePath); 1052fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bool needsToRunGC(const bool mindsBlockByGC) const; 1072fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1082fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void getProperty(const char *const query, const int queryLength, char *const outResult, 1092fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int maxResultLength); 1102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1116ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; 1122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 113f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int getNextWordAndNextToken(const int token, int *const outCodePoints, 114f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int *const outCodePointCount); 11538f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi 116b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi bool isCorrupted() const { 117b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi return mIsCorrupted; 118b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi } 119b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi 1202fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private: 1212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); 1222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const UNIGRAM_COUNT_QUERY; 1242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const BIGRAM_COUNT_QUERY; 1252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const MAX_UNIGRAM_COUNT_QUERY; 1262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const MAX_BIGRAM_COUNT_QUERY; 1272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // When the dictionary size is near the maximum size, we have to refuse dynamic operations to 1282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // prevent the dictionary from overflowing. 1292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 1302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; 1312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1324ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; 1332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const HeaderPolicy *const mHeaderPolicy; 1342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa BufferWithExtendableBuffer *const mDictBuffer; 1352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4ShortcutListPolicy mShortcutPolicy; 1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieNodeReader mNodeReader; 1371d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi Ver4PtNodeArrayReader mPtNodeArrayReader; 1382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieNodeWriter mNodeWriter; 1392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DynamicPtUpdatingHelper mUpdatingHelper; 1402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieWritingHelper mWritingHelper; 1412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int mUnigramCount; 1422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int mBigramCount; 143941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi std::vector<int> mTerminalPtNodePositionsForIteratingWords; 144b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi mutable bool mIsCorrupted; 145847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi 146ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi int getShortcutPositionOfWord(const int wordId) const; 1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}; 1482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime 1492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H 150