ver4_patricia_trie_policy.h revision cb4f544198e0592e3e4bb96f1592bc0bd2beb6ed
12fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa/* 22fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Copyright (C) 2013, The Android Open Source Project 32fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 42fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Licensed under the Apache License, Version 2.0 (the "License"); 52fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * you may not use this file except in compliance with the License. 62fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * You may obtain a copy of the License at 72fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 82fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * http://www.apache.org/licenses/LICENSE-2.0 92fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Unless required by applicable law or agreed to in writing, software 112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * distributed under the License is distributed on an "AS IS" BASIS, 122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * See the License for the specific language governing permissions and 142fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * limitations under the License. 152fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */ 162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H 182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H 192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 20941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi#include <vector> 21941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi 222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h" 232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" 242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/header/header_policy.h" 252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" 266e4b674f83e0c287e00bfe6546db2a1f93daf5f0Keisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" 272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" 282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" 302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" 311d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" 322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" 336ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h" 342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime { 362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNode; 382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNodeVector; 392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 407d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Artificial id that is stored in the PtNode looked up by the word. 412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { 422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public: 434ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) 444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), 454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mDictBuffer(mBuffers->getWritableTrieBuffer()), 464ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), 474ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mBuffers->getTerminalPositionLookupTable()), 48cb4f544198e0592e3e4bb96f1592bc0bd2beb6edKeisuke Kuroyanagi mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer), 4957816c7a8bac1a47913da7a503ece2b5dd7cc0fcKeisuke Kuroyanagi mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, 50fe395232d69df0887863c1cbabe63def2586d29eKeisuke Kuroyanagi &mPtNodeArrayReader, &mShortcutPolicy), 512fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), 522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mWritingHelper(mBuffers.get()), 532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mUnigramCount(mHeaderPolicy->getUnigramCount()), 54941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi mBigramCount(mHeaderPolicy->getBigramCount()), 55b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; 562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa AK_FORCE_INLINE int getRootPosition() const { 582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return 0; 592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void createAndGetAllChildDicNodes(const DicNode *const dicNode, 622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DicNodeVector *const childDicNodes) const; 632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 6465a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, 6565a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int *const outCodePoints) const; 662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 6789a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; 682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 69537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, 70537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const int wordId, MultiBigramMap *const multiBigramMap) const; 719f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi 72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi // TODO: Remove 73537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi int getProbability(const int unigramProbability, const int bigramProbability) const { 74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi // Not used. 75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi return NOT_A_PROBABILITY; 76537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi } 772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 78537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; 792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 80537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi void iterateNgramEntries(const WordIdArrayView prevWordIds, 81537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi NgramListener *const listener) const; 822d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi 83ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; 842fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 852fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { 862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mHeaderPolicy; 872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 896ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool addUnigramEntry(const CodePointArrayView wordCodePoints, 90793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi const UnigramProperty *const unigramProperty); 912fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 926ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool removeUnigramEntry(const CodePointArrayView wordCodePoints); 93f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi 949f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, 95620a05ae59ec9f7be39557094fc306c51c712ca1Keisuke Kuroyanagi const BigramProperty *const bigramProperty); 962fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 976ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, 986ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const CodePointArrayView wordCodePoints); 992fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 100dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flush(const char *const filePath); 1012fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 102dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flushWithGC(const char *const filePath); 1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1042fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bool needsToRunGC(const bool mindsBlockByGC) const; 1052fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void getProperty(const char *const query, const int queryLength, char *const outResult, 1072fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int maxResultLength); 1082fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1096ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; 1102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 111f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int getNextWordAndNextToken(const int token, int *const outCodePoints, 112f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int *const outCodePointCount); 11338f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi 114b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi bool isCorrupted() const { 115b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi return mIsCorrupted; 116b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi } 117b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi 1182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private: 1192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); 1202fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const UNIGRAM_COUNT_QUERY; 1222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const BIGRAM_COUNT_QUERY; 1232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const MAX_UNIGRAM_COUNT_QUERY; 1242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const MAX_BIGRAM_COUNT_QUERY; 1252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // When the dictionary size is near the maximum size, we have to refuse dynamic operations to 1262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // prevent the dictionary from overflowing. 1272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 1282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; 1292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1304ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; 1312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const HeaderPolicy *const mHeaderPolicy; 1322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa BufferWithExtendableBuffer *const mDictBuffer; 1332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4ShortcutListPolicy mShortcutPolicy; 1342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieNodeReader mNodeReader; 1351d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi Ver4PtNodeArrayReader mPtNodeArrayReader; 1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieNodeWriter mNodeWriter; 1372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DynamicPtUpdatingHelper mUpdatingHelper; 1382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieWritingHelper mWritingHelper; 1392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int mUnigramCount; 1402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int mBigramCount; 141941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi std::vector<int> mTerminalPtNodePositionsForIteratingWords; 142b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi mutable bool mIsCorrupted; 143847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi 144ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi int getShortcutPositionOfWord(const int wordId) const; 1452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}; 1462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime 1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H 148