12fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa/* 22fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Copyright (C) 2013, The Android Open Source Project 32fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 42fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Licensed under the Apache License, Version 2.0 (the "License"); 52fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * you may not use this file except in compliance with the License. 62fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * You may obtain a copy of the License at 72fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 82fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * http://www.apache.org/licenses/LICENSE-2.0 92fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Unless required by applicable law or agreed to in writing, software 112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * distributed under the License is distributed on an "AS IS" BASIS, 122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * See the License for the specific language governing permissions and 142fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * limitations under the License. 152fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */ 162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H 182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H 192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 20941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi#include <vector> 21941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi 222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h" 2388bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/header/header_policy.h" 2488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" 2588bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" 2688bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" 2788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_dict_buffers.h" 2888bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 2988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h" 3088bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" 3188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_pt_node_array_reader.h" 3288bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/buffer_with_extendable_buffer.h" 3388bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/entry_counters.h" 346ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h" 352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime { 372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNode; 392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNodeVector; 402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 417d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Artificial id that is stored in the PtNode looked up by the word. 422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { 432fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public: 444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) 454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), 464ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mDictBuffer(mBuffers->getWritableTrieBuffer()), 474ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), 484ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi mBuffers->getTerminalPositionLookupTable()), 49cb4f544198e0592e3e4bb96f1592bc0bd2beb6edKeisuke Kuroyanagi mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer), 505400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader, 515400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi &mShortcutPolicy), 522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), 532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mWritingHelper(mBuffers.get()), 5478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), 55b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; 562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa AK_FORCE_INLINE int getRootPosition() const { 582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return 0; 592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void createAndGetAllChildDicNodes(const DicNode *const dicNode, 622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DicNodeVector *const childDicNodes) const; 632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 6465a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, 6565a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int *const outCodePoints) const; 662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 6789a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; 682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 69537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, 70537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const int wordId, MultiBigramMap *const multiBigramMap) const; 719f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi 72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi // TODO: Remove 73537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi int getProbability(const int unigramProbability, const int bigramProbability) const { 74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi // Not used. 75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi return NOT_A_PROBABILITY; 76537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi } 772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 78537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; 792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 80537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi void iterateNgramEntries(const WordIdArrayView prevWordIds, 81537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi NgramListener *const listener) const; 822d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi 83ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; 842fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 852fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { 862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mHeaderPolicy; 872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 896ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool addUnigramEntry(const CodePointArrayView wordCodePoints, 90793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi const UnigramProperty *const unigramProperty); 912fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 926ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool removeUnigramEntry(const CodePointArrayView wordCodePoints); 93f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi 9488bb28c132d87f15a52e9a0b8a45950f39eb19adKeisuke Kuroyanagi bool addNgramEntry(const NgramProperty *const ngramProperty); 952fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 9672e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi bool removeNgramEntry(const NgramContext *const ngramContext, 976ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const CodePointArrayView wordCodePoints); 982fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 9972e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, 10029777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi const CodePointArrayView wordCodePoints, const bool isValidWord, 10129777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi const HistoricalInfo historicalInfo); 10229777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi 103dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flush(const char *const filePath); 1042fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 105dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flushWithGC(const char *const filePath); 1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1072fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bool needsToRunGC(const bool mindsBlockByGC) const; 1082fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1092fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void getProperty(const char *const query, const int queryLength, char *const outResult, 1102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int maxResultLength); 1112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1126ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; 1132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 114f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int getNextWordAndNextToken(const int token, int *const outCodePoints, 115f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int *const outCodePointCount); 11638f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi 117b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi bool isCorrupted() const { 118b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi return mIsCorrupted; 119b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi } 120b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi 1212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private: 1222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); 1232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const UNIGRAM_COUNT_QUERY; 1252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const BIGRAM_COUNT_QUERY; 1262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const MAX_UNIGRAM_COUNT_QUERY; 1272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const char *const MAX_BIGRAM_COUNT_QUERY; 1282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // When the dictionary size is near the maximum size, we have to refuse dynamic operations to 1292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // prevent the dictionary from overflowing. 1302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 1312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; 1322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1334ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; 1342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const HeaderPolicy *const mHeaderPolicy; 1352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa BufferWithExtendableBuffer *const mDictBuffer; 1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4ShortcutListPolicy mShortcutPolicy; 1372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieNodeReader mNodeReader; 1381d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi Ver4PtNodeArrayReader mPtNodeArrayReader; 1392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieNodeWriter mNodeWriter; 1402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DynamicPtUpdatingHelper mUpdatingHelper; 1412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Ver4PatriciaTrieWritingHelper mWritingHelper; 142e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi MutableEntryCounters mEntryCounters; 143941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi std::vector<int> mTerminalPtNodePositionsForIteratingWords; 144b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi mutable bool mIsCorrupted; 145847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi 146ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi int getShortcutPositionOfWord(const int wordId) const; 1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}; 1482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime 1492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H 150