ver4_patricia_trie_policy.h revision 537f6eea8a8d56fe532913a37f4dbff4b3d490af
12fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa/*
22fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Copyright (C) 2013, The Android Open Source Project
32fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
42fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Licensed under the Apache License, Version 2.0 (the "License");
52fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * you may not use this file except in compliance with the License.
62fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * You may obtain a copy of the License at
72fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
82fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *     http://www.apache.org/licenses/LICENSE-2.0
92fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Unless required by applicable law or agreed to in writing, software
112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * distributed under the License is distributed on an "AS IS" BASIS,
122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * See the License for the specific language governing permissions and
142fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * limitations under the License.
152fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */
162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
20941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi#include <vector>
21941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi
222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h"
232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/header/header_policy.h"
252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
266e4b674f83e0c287e00bfe6546db2a1f93daf5f0Keisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
311d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h"
322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
336ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h"
342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime {
362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNode;
382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNodeVector;
392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
407d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Artificial id that is stored in the PtNode looked up by the word.
412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public:
434ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi    Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi            : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi              mDictBuffer(mBuffers->getWritableTrieBuffer()),
464ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi              mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
474ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi                      mBuffers->getTerminalPositionLookupTable()),
48851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi              mNodeReader(mDictBuffer, mBuffers->getLanguageModelDictContent(), mHeaderPolicy),
491d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi              mPtNodeArrayReader(mDictBuffer),
5057816c7a8bac1a47913da7a503ece2b5dd7cc0fcKeisuke Kuroyanagi              mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader,
51fe395232d69df0887863c1cbabe63def2586d29eKeisuke Kuroyanagi                      &mPtNodeArrayReader, &mShortcutPolicy),
522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mWritingHelper(mBuffers.get()),
542fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mUnigramCount(mHeaderPolicy->getUnigramCount()),
55941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi              mBigramCount(mHeaderPolicy->getBigramCount()),
56b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi              mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    AK_FORCE_INLINE int getRootPosition() const {
592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return 0;
602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void createAndGetAllChildDicNodes(const DicNode *const dicNode,
632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            DicNodeVector *const childDicNodes) const;
642fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
652fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int getCodePointsAndProbabilityAndReturnCodePointCount(
6694e4cd25a8f7417d30a0832f7476d39ece1df788Keisuke Kuroyanagi            const int wordId, const int maxCodePointCount, int *const outCodePoints,
672fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            int *const outUnigramProbability) const;
682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
6989a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi    int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
71537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            const int wordId, MultiBigramMap *const multiBigramMap) const;
739f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi
74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    // TODO: Remove
75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    int getProbability(const int unigramProbability, const int bigramProbability) const {
76537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi        // Not used.
77537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi        return NOT_A_PROBABILITY;
78537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    }
792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
80537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
812fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
82537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    void iterateNgramEntries(const WordIdArrayView prevWordIds,
83537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            NgramListener *const listener) const;
842d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi
85ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi    BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mHeaderPolicy;
892fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
902fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
916ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool addUnigramEntry(const CodePointArrayView wordCodePoints,
92793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi            const UnigramProperty *const unigramProperty);
932fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
946ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
95f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi
969f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
97620a05ae59ec9f7be39557094fc306c51c712ca1Keisuke Kuroyanagi            const BigramProperty *const bigramProperty);
982fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
996ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
1006ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi            const CodePointArrayView wordCodePoints);
1012fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
102dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flush(const char *const filePath);
1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
104dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flushWithGC(const char *const filePath);
1052fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    bool needsToRunGC(const bool mindsBlockByGC) const;
1072fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1082fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void getProperty(const char *const query, const int queryLength, char *const outResult,
1092fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            const int maxResultLength);
1102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1116ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
1122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
113f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi    int getNextWordAndNextToken(const int token, int *const outCodePoints,
114f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi            int *const outCodePointCount);
11538f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi
116b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    bool isCorrupted() const {
117b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi        return mIsCorrupted;
118b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    }
119b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi
1202fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private:
1212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
1222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const UNIGRAM_COUNT_QUERY;
1242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const BIGRAM_COUNT_QUERY;
1252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const MAX_UNIGRAM_COUNT_QUERY;
1262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const MAX_BIGRAM_COUNT_QUERY;
1272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
1282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // prevent the dictionary from overflowing.
1292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
1302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
1312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1324ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi    const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
1332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const HeaderPolicy *const mHeaderPolicy;
1342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    BufferWithExtendableBuffer *const mDictBuffer;
1352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4ShortcutListPolicy mShortcutPolicy;
1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieNodeReader mNodeReader;
1371d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi    Ver4PtNodeArrayReader mPtNodeArrayReader;
1382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieNodeWriter mNodeWriter;
1392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    DynamicPtUpdatingHelper mUpdatingHelper;
1402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieWritingHelper mWritingHelper;
1412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int mUnigramCount;
1422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int mBigramCount;
143941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
144b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    mutable bool mIsCorrupted;
145847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi
146ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi    int getShortcutPositionOfWord(const int wordId) const;
1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa};
1482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime
1492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
150