1c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi/* 2c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * Copyright (C) 2013, The Android Open Source Project 3c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * 4c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * Licensed under the Apache License, Version 2.0 (the "License"); 5c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * you may not use this file except in compliance with the License. 6c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * You may obtain a copy of the License at 7c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * 8c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * http://www.apache.org/licenses/LICENSE-2.0 9c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * 10c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * Unless required by applicable law or agreed to in writing, software 11c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * distributed under the License is distributed on an "AS IS" BASIS, 12c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * See the License for the specific language governing permissions and 14c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * limitations under the License. 15c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi */ 16c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 17c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#ifndef LATINIME_PATRICIA_TRIE_POLICY_H 18c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#define LATINIME_PATRICIA_TRIE_POLICY_H 19c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 20cafab169cdb21244c82b99c09983c98066113d87Ken Wakasa#include <cstdint> 210fc93fe4455f24809f6c9baf0d3b936519779cfbKeisuke Kuroyanagi#include <vector> 22e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi 23c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#include "defines.h" 2488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/header/header_policy.h" 2588bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" 2688bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/bigram/bigram_list_policy.h" 2788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/shortcut/shortcut_list_policy.h" 2888bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h" 2988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/ver2_pt_node_array_reader.h" 3088bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/format_utils.h" 3188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/mmapped_buffer.h" 32c0c674cdc0721a374e140ad5ee1409c0498b3262Keisuke Kuroyanagi#include "utils/byte_array_view.h" 336ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h" 34c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 35c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynaginamespace latinime { 36c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 37e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagiclass DicNode; 38e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagiclass DicNodeVector; 39e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi 407d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Position of a PtNode that represents the word. 4179bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi// Max supported n-gram is bigram. 42e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagiclass PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { 43c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi public: 444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer) 454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi : mMmappedBuffer(std::move(mmappedBuffer)), 46c0c674cdc0721a374e140ad5ee1409c0498b3262Keisuke Kuroyanagi mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(), 475a20827fc7ea9425d6a1d336681e1b1cc1c7d0a3Keisuke Kuroyanagi FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())), 48180e7b4c07f8c17862b9bdcdee0bc4b4413328a9Keisuke Kuroyanagi mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())), 4959ebd5171820785f4c7b893f48df8b9d65cc4902Keisuke Kuroyanagi mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer), 50fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy, 51fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto mHeaderPolicy.getCodePointTable()), 526258c57c323e35819262fc0ce0c7ce60ea492b77Keisuke Kuroyanagi mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(), 536258c57c323e35819262fc0ce0c7ce60ea492b77Keisuke Kuroyanagi mIsCorrupted(false) {} 54e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi 55c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi AK_FORCE_INLINE int getRootPosition() const { 56c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi return 0; 57c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi } 58c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void createAndGetAllChildDicNodes(const DicNode *const dicNode, 607fd9667d76cdc6febe32545865648ea90dc28904Keisuke Kuroyanagi DicNodeVector *const childDicNodes) const; 61c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 6265a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, 6365a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int *const outCodePoints) const; 64c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 6589a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; 66c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 67537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, 68537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi const int wordId, MultiBigramMap *const multiBigramMap) const; 699f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi 7065d19946bebd1cc6299e2789cc0fc097d1898e80Keisuke Kuroyanagi int getProbability(const int unigramProbability, const int bigramProbability) const; 7165d19946bebd1cc6299e2789cc0fc097d1898e80Keisuke Kuroyanagi 72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; 73c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi void iterateNgramEntries(const WordIdArrayView prevWordIds, 75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi NgramListener *const listener) const; 762d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi 77ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; 78c38ec475962ad6b7f14abe35b950545ebcdbe3c5Keisuke Kuroynagi 7976e579c7caf2ef04f440be21c27377fe0b4150ffKeisuke Kuroyanagi const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { 8076e579c7caf2ef04f440be21c27377fe0b4150ffKeisuke Kuroyanagi return &mHeaderPolicy; 81d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi } 82d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi 836ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool addUnigramEntry(const CodePointArrayView wordCodePoints, 84793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi const UnigramProperty *const unigramProperty) { 850624cc6cf3971aa3c189185208571a5f3d0c459cKeisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 869f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); 8766facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi return false; 8866facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi } 8966facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi 906ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi bool removeUnigramEntry(const CodePointArrayView wordCodePoints) { 91f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 92f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); 93f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi return false; 94f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi } 95f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi 9688bb28c132d87f15a52e9a0b8a45950f39eb19adKeisuke Kuroyanagi bool addNgramEntry(const NgramProperty *const ngramProperty) { 970624cc6cf3971aa3c189185208571a5f3d0c459cKeisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 989f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); 9966facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi return false; 10066facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi } 10166facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi 10272e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi bool removeNgramEntry(const NgramContext *const ngramContext, 1036ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const CodePointArrayView wordCodePoints) { 1040624cc6cf3971aa3c189185208571a5f3d0c459cKeisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 1059f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); 10666facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi return false; 10766facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi } 10866facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi 10972e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, 11029777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi const CodePointArrayView wordCodePoints, const bool isValidWord, 11129777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi const HistoricalInfo historicalInfo) { 11229777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 113ab4437f4681db3fb8327f752df8f08bd0d8cf967Keisuke Kuroyanagi AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " 114ab4437f4681db3fb8327f752df8f08bd0d8cf967Keisuke Kuroyanagi "dictionary."); 11529777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi return false; 11629777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi } 11729777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi 118dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flush(const char *const filePath) { 119d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 120d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi AKLOGI("Warning: flush() is called for non-updatable dictionary."); 121dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi return false; 122d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi } 123d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi 124dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi bool flushWithGC(const char *const filePath) { 125d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 126d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 127dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi return false; 128d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi } 129d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi 130c18510049a3422c88ed3ab3bbc64944c94a611fdKeisuke Kuroyanagi bool needsToRunGC(const bool mindsBlockByGC) const { 131d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi // This method should not be called for non-updatable dictionary. 132d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 133d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi return false; 134d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi } 135d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi 1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa void getProperty(const char *const query, const int queryLength, char *const outResult, 137699531099630edd8416e309c914187c285af4c44Keisuke Kuroyanagi const int maxResultLength) { 13831097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi // getProperty is not supported for this class. 13931097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi if (maxResultLength > 0) { 14031097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi outResult[0] = '\0'; 14131097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi } 14231097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi } 14331097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi 1446ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; 1452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 146f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int getNextWordAndNextToken(const int token, int *const outCodePoints, 147f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi int *const outCodePointCount); 14838f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi 149b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi bool isCorrupted() const { 150b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi return mIsCorrupted; 151b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi } 152b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi 153c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi private: 154e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); 155c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi 1562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; 15776e579c7caf2ef04f440be21c27377fe0b4150ffKeisuke Kuroyanagi const HeaderPolicy mHeaderPolicy; 158180e7b4c07f8c17862b9bdcdee0bc4b4413328a9Keisuke Kuroyanagi const ReadOnlyByteArrayView mBuffer; 159668870be431d17ee4ceb5ce161aee1189063af18Keisuke Kuroyanagi const BigramListPolicy mBigramListPolicy; 160fd10db04e02ddad88d0c6fca82583493955a7c7eKeisuke Kuroyanagi const ShortcutListPolicy mShortcutListPolicy; 1611e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi const Ver2ParticiaTrieNodeReader mPtNodeReader; 162be6117058840492c2862f8ae9f7dc95c29f3a8f3Keisuke Kuroyanagi const Ver2PtNodeArrayReader mPtNodeArrayReader; 1630fc93fe4455f24809f6c9baf0d3b936519779cfbKeisuke Kuroyanagi std::vector<int> mTerminalPtNodePositionsForIteratingWords; 164b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi mutable bool mIsCorrupted; 1651fb11da36ab279fa4fcc62d772d9cce877bf23bdKeisuke Kuroynagi 16665a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId, 16765a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi const int maxCodePointCount, int *const outCodePoints, 16865a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi int *const outUnigramProbability) const; 169847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi int getShortcutPositionOfPtNode(const int ptNodePos) const; 170b00973952f269ebee6d1d5f808fad7ca64fb9954Keisuke Kuroyanagi int getBigramsPositionOfPtNode(const int ptNodePos) const; 17177ef75cbe6722d1eb45115c1ad82f963444d71cdKeisuke Kuroyanagi int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, 1727fd9667d76cdc6febe32545865648ea90dc28904Keisuke Kuroyanagi DicNodeVector *const childDicNodes) const; 17389a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; 17489a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi int getTerminalPtNodePosFromWordId(const int wordId) const; 1752111e3abc9c9c0ea0350b8470532bf636b78cdd7Keisuke Kuroyanagi const WordAttributes getWordAttributes(const int probability, 1762111e3abc9c9c0ea0350b8470532bf636b78cdd7Keisuke Kuroyanagi const PtNodeParams &ptNodeParams) const; 177180e7b4c07f8c17862b9bdcdee0bc4b4413328a9Keisuke Kuroyanagi bool isValidPos(const int pos) const; 178c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi}; 179c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi} // namespace latinime 180c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#endif // LATINIME_PATRICIA_TRIE_POLICY_H 181