1c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi/*
2c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * Copyright (C) 2013, The Android Open Source Project
3c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi *
4c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * Licensed under the Apache License, Version 2.0 (the "License");
5c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * you may not use this file except in compliance with the License.
6c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * You may obtain a copy of the License at
7c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi *
8c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi *     http://www.apache.org/licenses/LICENSE-2.0
9c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi *
10c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * Unless required by applicable law or agreed to in writing, software
11c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * distributed under the License is distributed on an "AS IS" BASIS,
12c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * See the License for the specific language governing permissions and
14c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi * limitations under the License.
15c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi */
16c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
17c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#ifndef LATINIME_PATRICIA_TRIE_POLICY_H
18c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#define LATINIME_PATRICIA_TRIE_POLICY_H
19c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
20cafab169cdb21244c82b99c09983c98066113d87Ken Wakasa#include <cstdint>
210fc93fe4455f24809f6c9baf0d3b936519779cfbKeisuke Kuroyanagi#include <vector>
22e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi
23c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#include "defines.h"
2488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/header/header_policy.h"
2588bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
2688bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/bigram/bigram_list_policy.h"
2788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/shortcut/shortcut_list_policy.h"
2888bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
2988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v2/ver2_pt_node_array_reader.h"
3088bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/format_utils.h"
3188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/mmapped_buffer.h"
32c0c674cdc0721a374e140ad5ee1409c0498b3262Keisuke Kuroyanagi#include "utils/byte_array_view.h"
336ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h"
34c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
35c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynaginamespace latinime {
36c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
37e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagiclass DicNode;
38e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagiclass DicNodeVector;
39e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi
407d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Position of a PtNode that represents the word.
4179bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi// Max supported n-gram is bigram.
42e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagiclass PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
43c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi public:
444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi    PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi            : mMmappedBuffer(std::move(mmappedBuffer)),
46c0c674cdc0721a374e140ad5ee1409c0498b3262Keisuke Kuroyanagi              mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
475a20827fc7ea9425d6a1d336681e1b1cc1c7d0a3Keisuke Kuroyanagi                      FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())),
48180e7b4c07f8c17862b9bdcdee0bc4b4413328a9Keisuke Kuroyanagi              mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
4959ebd5171820785f4c7b893f48df8b9d65cc4902Keisuke Kuroyanagi              mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
50fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto              mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
51fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto                      mHeaderPolicy.getCodePointTable()),
526258c57c323e35819262fc0ce0c7ce60ea492b77Keisuke Kuroyanagi              mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
536258c57c323e35819262fc0ce0c7ce60ea492b77Keisuke Kuroyanagi              mIsCorrupted(false) {}
54e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi
55c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi    AK_FORCE_INLINE int getRootPosition() const {
56c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi        return 0;
57c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi    }
58c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void createAndGetAllChildDicNodes(const DicNode *const dicNode,
607fd9667d76cdc6febe32545865648ea90dc28904Keisuke Kuroyanagi            DicNodeVector *const childDicNodes) const;
61c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
6265a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi    int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
6365a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi            int *const outCodePoints) const;
64c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
6589a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi    int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
66c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
67537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
68537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            const int wordId, MultiBigramMap *const multiBigramMap) const;
699f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi
7065d19946bebd1cc6299e2789cc0fc097d1898e80Keisuke Kuroyanagi    int getProbability(const int unigramProbability, const int bigramProbability) const;
7165d19946bebd1cc6299e2789cc0fc097d1898e80Keisuke Kuroyanagi
72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
73c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    void iterateNgramEntries(const WordIdArrayView prevWordIds,
75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            NgramListener *const listener) const;
762d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi
77ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi    BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
78c38ec475962ad6b7f14abe35b950545ebcdbe3c5Keisuke Kuroynagi
7976e579c7caf2ef04f440be21c27377fe0b4150ffKeisuke Kuroyanagi    const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
8076e579c7caf2ef04f440be21c27377fe0b4150ffKeisuke Kuroyanagi        return &mHeaderPolicy;
81d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi    }
82d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi
836ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool addUnigramEntry(const CodePointArrayView wordCodePoints,
84793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi            const UnigramProperty *const unigramProperty) {
850624cc6cf3971aa3c189185208571a5f3d0c459cKeisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
869f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
8766facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi        return false;
8866facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi    }
8966facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi
906ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool removeUnigramEntry(const CodePointArrayView wordCodePoints) {
91f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
92f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi        AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
93f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi        return false;
94f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi    }
95f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi
9688bb28c132d87f15a52e9a0b8a45950f39eb19adKeisuke Kuroyanagi    bool addNgramEntry(const NgramProperty *const ngramProperty) {
970624cc6cf3971aa3c189185208571a5f3d0c459cKeisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
989f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
9966facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi        return false;
10066facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi    }
10166facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi
10272e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi    bool removeNgramEntry(const NgramContext *const ngramContext,
1036ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi            const CodePointArrayView wordCodePoints) {
1040624cc6cf3971aa3c189185208571a5f3d0c459cKeisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
1059f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi        AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
10666facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi        return false;
10766facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi    }
10866facd37ddf8fc23ed2508a114c446147aaca724Keisuke Kuroyanagi
10972e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi    bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
11029777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi            const CodePointArrayView wordCodePoints, const bool isValidWord,
11129777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi            const HistoricalInfo historicalInfo) {
11229777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
113ab4437f4681db3fb8327f752df8f08bd0d8cf967Keisuke Kuroyanagi        AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
114ab4437f4681db3fb8327f752df8f08bd0d8cf967Keisuke Kuroyanagi                "dictionary.");
11529777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi        return false;
11629777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi    }
11729777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi
118dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flush(const char *const filePath) {
119d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
120d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi        AKLOGI("Warning: flush() is called for non-updatable dictionary.");
121dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi        return false;
122d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi    }
123d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi
124dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flushWithGC(const char *const filePath) {
125d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
126d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi        AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
127dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi        return false;
128d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi    }
129d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi
130c18510049a3422c88ed3ab3bbc64944c94a611fdKeisuke Kuroyanagi    bool needsToRunGC(const bool mindsBlockByGC) const {
131d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi        // This method should not be called for non-updatable dictionary.
132d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi        AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
133d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi        return false;
134d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi    }
135d0246277fde27e9c40a270e206f1d106811e847fKeisuke Kuroyanagi
1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void getProperty(const char *const query, const int queryLength, char *const outResult,
137699531099630edd8416e309c914187c285af4c44Keisuke Kuroyanagi            const int maxResultLength) {
13831097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi        // getProperty is not supported for this class.
13931097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi        if (maxResultLength > 0) {
14031097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi            outResult[0] = '\0';
14131097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi        }
14231097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi    }
14331097a57cc6f8022abc0ea56f27147399f41b630Keisuke Kuroyanagi
1446ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
1452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
146f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi    int getNextWordAndNextToken(const int token, int *const outCodePoints,
147f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi            int *const outCodePointCount);
14838f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi
149b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    bool isCorrupted() const {
150b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi        return mIsCorrupted;
151b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    }
152b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi
153c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi private:
154e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi    DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
155c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi
1562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const MmappedBuffer::MmappedBufferPtr mMmappedBuffer;
15776e579c7caf2ef04f440be21c27377fe0b4150ffKeisuke Kuroyanagi    const HeaderPolicy mHeaderPolicy;
158180e7b4c07f8c17862b9bdcdee0bc4b4413328a9Keisuke Kuroyanagi    const ReadOnlyByteArrayView mBuffer;
159668870be431d17ee4ceb5ce161aee1189063af18Keisuke Kuroyanagi    const BigramListPolicy mBigramListPolicy;
160fd10db04e02ddad88d0c6fca82583493955a7c7eKeisuke Kuroyanagi    const ShortcutListPolicy mShortcutListPolicy;
1611e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    const Ver2ParticiaTrieNodeReader mPtNodeReader;
162be6117058840492c2862f8ae9f7dc95c29f3a8f3Keisuke Kuroyanagi    const Ver2PtNodeArrayReader mPtNodeArrayReader;
1630fc93fe4455f24809f6c9baf0d3b936519779cfbKeisuke Kuroyanagi    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
164b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    mutable bool mIsCorrupted;
1651fb11da36ab279fa4fcc62d772d9cce877bf23bdKeisuke Kuroynagi
16665a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi    int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId,
16765a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi            const int maxCodePointCount, int *const outCodePoints,
16865a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi            int *const outUnigramProbability) const;
169847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi    int getShortcutPositionOfPtNode(const int ptNodePos) const;
170b00973952f269ebee6d1d5f808fad7ca64fb9954Keisuke Kuroyanagi    int getBigramsPositionOfPtNode(const int ptNodePos) const;
17177ef75cbe6722d1eb45115c1ad82f963444d71cdKeisuke Kuroyanagi    int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
1727fd9667d76cdc6febe32545865648ea90dc28904Keisuke Kuroyanagi            DicNodeVector *const childDicNodes) const;
17389a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi    int getWordIdFromTerminalPtNodePos(const int ptNodePos) const;
17489a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi    int getTerminalPtNodePosFromWordId(const int wordId) const;
1752111e3abc9c9c0ea0350b8470532bf636b78cdd7Keisuke Kuroyanagi    const WordAttributes getWordAttributes(const int probability,
1762111e3abc9c9c0ea0350b8470532bf636b78cdd7Keisuke Kuroyanagi            const PtNodeParams &ptNodeParams) const;
177180e7b4c07f8c17862b9bdcdee0bc4b4413328a9Keisuke Kuroyanagi    bool isValidPos(const int pos) const;
178c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi};
179c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi} // namespace latinime
180c5e6efafff56c57c5527fe64dddb851df0719634Keisuke Kuroynagi#endif // LATINIME_PATRICIA_TRIE_POLICY_H
181